In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# save for later
#from sklearn.decomposition import PCA
#import seaborn as sns
#import matplotlib.pyplot as plt
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
#from sklearn.linear_model import LogisticRegression
#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import classification_report


This notebook starts by preprocessing the two CSV files that will be used to train several supervised learning models. The preprocessing steps are:
- Removal of usernames, URLs, and special characters
- Lowercasing text
- Tokenization (nltk or spaCy): breaking text into smaller units 
- Stopword removal: remove very common words that carry little meaning and would otherwise end up as index terms ("and", "or", "the", "in")
- Lemmatization: reduces words to their base or dictionary form
- TF-IDF vectorization for feature extraction: a technique that converts text data into numerical vectors, representing the importance of words in a document relative to a collection of documents, by combining term frequency with inverse document frequency

In [12]:
# Load the labelled tweets (path is relative to the notebook's working directory).
df = pd.read_csv("TrainingData/labeled_data.csv")

# Quick look at the data: summary statistics, (rows, columns), and the first rows.
print(df.describe())
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


         Unnamed: 0         count   hate_speech  offensive_language  \
count  24783.000000  24783.000000  24783.000000        24783.000000   
mean   12681.192027      3.243473      0.280515            2.413711   
std     7299.553863      0.883060      0.631851            1.399459   
min        0.000000      3.000000      0.000000            0.000000   
25%     6372.500000      3.000000      0.000000            2.000000   
50%    12703.000000      3.000000      0.000000            3.000000   
75%    18995.500000      3.000000      0.000000            3.000000   
max    25296.000000      9.000000      7.000000            9.000000   

            neither         class  
count  24783.000000  24783.000000  
mean       0.549247      1.110277  
std        1.113299      0.462089  
min        0.000000      0.000000  
25%        0.000000      1.000000  
50%        0.000000      1.000000  
75%        0.000000      1.000000  
max        9.000000      2.000000  
(24783, 7)
   Unnamed: 0  count  hat

**count**: number of CrowdFlower (CF) users who coded each tweet (the minimum is 3; more users coded a tweet when the judgments were determined to be unreliable by CF)

**hate_speech**: number of CF users who judged the tweet to be hate speech

**offensive_language**: number of CF users who judged the tweet to be offensive

**neither**: number of CF users who judged the tweet to be neither hate speech nor offensive language

**class**: class label assigned by the majority of CF users: 0 = hate speech, 1 = offensive language, 2 = neither


In [27]:
# Case normalisation only: cast each tweet to str and case-fold it.
# Usernames, URLs and special characters are removed in a later cell.
tweet_column = (
    df['tweet']
    .astype(str)
    .str.casefold()
)
tweet_column.head(5)


0    !!! rt @mayasolovely: as a woman you shouldn't...
1    !!!!! rt @mleew17: boy dats cold...tyga dwn ba...
2    !!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...
3    !!!!!!!!! rt @c_g_anderson: @viva_based she lo...
4    !!!!!!!!!!!!! rt @shenikaroberts: the shit you...
Name: tweet, dtype: object

In [None]:
# Scrub the lowercased tweets in three ordered passes. The result is a
# comma-delimited string of word tokens (same format as before).
clean_tweet = (
    tweet_column
    # 1) @mentions, including an optional trailing colon as in "rt @user:"
    .str.replace(r'@\w+:?', ' ', regex=True)
    # 2) URLs: match only up to the next whitespace. The previous pattern
    #    `http.+` was greedy and deleted everything from the URL to the end
    #    of the line, silently dropping any tweet text that followed a link.
    .str.replace(r'http\S+', ',', regex=True)
    # 3) every run of non-word characters (anything other than letters,
    #    digits and underscore) collapses into a single comma
    .str.replace(r'\W+', ',', regex=True)
)
clean_tweet.head()

0    ,rt,as,a,woman,you,shouldn,t,complain,about,cl...
1    ,rt,boy,dats,cold,tyga,dwn,bad,for,cuffin,dat,...
2    ,rt,dawg,rt,you,ever,fuck,a,bitch,and,she,star...
3                           ,rt,she,look,like,a,tranny
4    ,rt,the,shit,you,hear,about,me,might,be,true,o...
Name: tweet, dtype: object

In [None]:
# tokenize