In [None]:
# Attach Google Drive to the Colab runtime so files under /content/drive are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the preprocessed dataset from the mounted Drive folder into the current working directory.
!cp  "/content/drive/My Drive/AML/Project/Data/preprocessed_full.csv" .

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the preprocessed corpus from the working directory.
data_full = pd.read_csv("preprocessed_full.csv")
# Convert the Text column from object to str. Empty texts come back from
# read_csv as NaN, and astype("str") alone would turn them into the literal
# token "nan"; replace them with an empty string first so they do not add a
# spurious feature to the n-gram vocabulary.
data_full["Text"] = data_full["Text"].fillna("").astype("str")

## Step 2.5: Train / Validate / Test Split
* 68% training set, 17% validation set, 15% test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for testing, then take 20% of the remaining 85%
# as the validation set (0.85 * 0.20 = 17% of all rows); 68% is left for training.
TEST_FRACTION = 0.15
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

X = data_full[["Text"]].values
y = data_full[["IsHate"]].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)
print(X_train.shape, X_validate.shape, X_test.shape)

(32403, 1) (8101, 1) (7148, 1)


## Step three: Feature Extraction
* word level: Ngram
* character level: 4 character n-grams 
* hybrid: word2vec + character 4-grams

## 1. N-gram model
### N = 1,2,3
### max_features = 6000

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
# Wrap the raw text splits in single-column DataFrames for the vectorizer cells.
# The column is named "Text" explicitly: the splits were selected with
# data_full[["Text"]] and later cells read `train_df.Text`, so taking the name
# from the first column of data_full only works while "Text" happens to be
# the first column of the CSV.
# NOTE: X_train / X_validate are overwritten with feature matrices by the
# vectorization cell that follows, so this cell must run before it.
column_col = data_full.columns.tolist()  # kept in case later cells rely on it
TEXT_COLUMN = "Text"
train_df = pd.DataFrame(X_train, columns=[TEXT_COLUMN])
validate_df = pd.DataFrame(X_validate, columns=[TEXT_COLUMN])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary (presence/absence) bag of word uni- and bi-grams, capped at 6000 features.
vectorizer = CountVectorizer(analyzer='word', binary=True, ngram_range=(1, 2), max_features=6000)
# Fit the vocabulary on the training texts only, then reuse it for validation.
X_train = vectorizer.fit_transform(train_df.Text).toarray()
X_validate = vectorizer.transform(validate_df.Text).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_col = list(vectorizer.get_feature_names_out())
else:
    feature_col = list(vectorizer.get_feature_names())
print(X_train)

X_train_df = pd.DataFrame(X_train, columns=feature_col)
X_validate_df = pd.DataFrame(X_validate, columns=feature_col)
# Flatten the labels so a plain 1-D column is assigned regardless of how y is shaped.
X_train_df["Label"] = np.ravel(y_train)  # add the y into it as well
X_validate_df["Label"] = np.ravel(y_validate)
X_validate_df.head()

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,0xabad1dea,0xjared,0xjared zython86,10,100,1000,100046729,10084,10084 65039,1017_sqquad,1041204,1041204 1041204,1043323,1043359,11,11wdnick,12,120,127881,127911,127926,128049,128056,128056 9749,128064,128064 128064,128069,128069 128166,128072,128073,128074,128074 128074,128075,128076,128077,128078,128079,128079 128079,128080,128081,...,yesyouresexist,yet,yo,yo ass,yo bitch,yo girl,yo hoe,yo pussy,yo shit,yopapi_chulo,york,young,young bitch,yous,yousufpoosuf,yousufpoosuf hillaryguess,yousufpoosuf rkinglive2dance,youtube,youtube video,ypg,ypgypj,yr,yrs,yu,yuck,yum,yum mkr,yummy,yung,yup,yuskan0723,yusufpeaceful,zaibatsunews,zebra,zero,zip,zone,zython86,zython86 m_m_myers,Label
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0


In [None]:
#construct using max_features = 6000, having 6000 features 

def N_gram_construction(train_df, validate_df, y_train, y_validate, N,
                        max_features=6000,
                        out_dir="/content/drive/My Drive/AML/Project/Data"):
    """Build binary word n-gram features (orders 1..N) and save them as CSV.

    The vocabulary is fitted on ``train_df.Text`` only and then applied to
    ``validate_df.Text``. Each resulting feature table gets a ``Label``
    column and is written to ``<out_dir>/Train_<N>_gram_prep.csv`` and
    ``<out_dir>/Validate_<N>_gram_prep.csv`` (without the index).

    Parameters
    ----------
    train_df, validate_df : DataFrame
        Frames with a ``Text`` column holding the documents.
    y_train, y_validate : array-like
        Labels aligned row-by-row with the corresponding frame; flattened to
        1-D before being stored.
    N : int
        Largest n-gram order; ``ngram_range`` is ``(1, N)``.
    max_features : int, default 6000
        Vocabulary size cap passed to ``CountVectorizer``.
    out_dir : str
        Directory the two CSV files are written to. It must already exist.

    Returns
    -------
    None
    """
    vectorizer = CountVectorizer(analyzer='word', binary=True,
                                 ngram_range=(1, N), max_features=max_features)
    X_train = vectorizer.fit_transform(train_df.Text).toarray()
    X_validate = vectorizer.transform(validate_df.Text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_col = list(vectorizer.get_feature_names_out())
    else:
        feature_col = list(vectorizer.get_feature_names())
    print("====%s-gram feature extracting========"%(N))

    X_train_df = pd.DataFrame(X_train, columns=feature_col)
    X_validate_df = pd.DataFrame(X_validate, columns=feature_col)
    # Flatten the labels so a plain 1-D column is assigned regardless of how y is shaped.
    X_train_df["Label"] = np.ravel(y_train)
    X_validate_df["Label"] = np.ravel(y_validate)

    X_train_df.to_csv("%s/Train_%s_gram_prep.csv" % (out_dir, N), index=False)
    X_validate_df.to_csv("%s/Validate_%s_gram_prep.csv" % (out_dir, N), index=False)
# Export the 1-gram, (1,2)-gram and (1,3)-gram feature tables in turn.
for max_order in (1, 2, 3):
    N_gram_construction(train_df, validate_df, y_train, y_validate, max_order)

