In [1]:
import pandas as pd
import numpy as np

from matplotlib import figure
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')


mt = pd.read_csv('MeTooHate.csv')

[nltk_data] Downloading package stopwords to /Users/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}

<IPython.core.display.Javascript object>

# Me Too Hate Comments
***

The goal of this project is to seperate hateful and non-hateful tweets.

## Step 1: Cleaning the dataset

which we always do by first taking a look at the big picture:
- small view of the dataset
- check the size of the dataset

In [3]:
mt.head()

Unnamed: 0,status_id,text,created_at,favorite_count,retweet_count,location,followers_count,friends_count,statuses_count,category
0,1046207313588236290,"Entitled, obnoxious, defensive, lying weasel. ...",2018-09-30T01:17:15Z,5,1,"McAllen, TX",2253,2303,23856,0
1,1046207328113086464,Thank you and for what you did for the women...,2018-09-30T01:17:19Z,5,2,"Tampa, FL",2559,4989,19889,0
2,1046207329589493760,Knitting (s) &amp; getting ready for January 1...,2018-09-30T01:17:19Z,0,0,"St Cloud, MN",16,300,9,0
3,1046207341283168256,Yep just like triffeling women weaponized thei...,2018-09-30T01:17:22Z,1,0,flyover country,3573,3732,38361,1
4,1046207347016826880,"No, the President wants to end movement posin...",2018-09-30T01:17:23Z,0,0,World,294,312,7635,0


In [4]:
mt.shape

(807174, 10)

Ok, so now we've seen what the data is about and that we have a big dataset to work with.
***
Next step is looking voor missing values.

In [5]:
IsNull = mt.isnull().sum()
print(IsNull)

status_id               0
text                 3536
created_at              0
favorite_count          0
retweet_count           0
location           190768
followers_count         0
friends_count           0
statuses_count          0
category                0
dtype: int64


### My plan cleaning:
We'll remove some of the useless columns first to get rid of useless information, "Location" is also going to be removed because it has too many missing values and is also not usefull for finding hate and non-hate comments. Next, we remove the isnull rows from the "text" because these are also not usefull for our model. Without any input those rows cannot help predict.

In [6]:
#Remove useless columns
mt = mt.drop(['status_id','location', 'created_at',
        'followers_count', 'friends_count', 'statuses_count',
       ], axis=1)

In [7]:
mt = mt.dropna()
mt.shape

(803638, 4)

Well, that looks a little smaller :)
***
Next up we use the NLT (Natural Language Toolkit) to clean off punctuation, stopwords. 

In [8]:
import string
#removing punctuations

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punct = text.translate(translator)
    return text_without_punct

mt['text_without_punct'] = mt['text'].apply(remove_punctuation)
mt['text_without_punct']

0         Entitled obnoxious defensive lying weasel This...
1         Thank you  and  for what you did for the women...
2         Knitting s amp getting ready for January 19 20...
3         Yep just like triffeling women weaponized thei...
4         No the President wants to end  movement posing...
                                ...                        
807169    Let’s not forget that this “iconic kiss” was u...
807170    DEFINITELYthe only one any of us should suppor...
807171    Did the  movement count the dollars of Erin An...
807172    This is one of my all time fav songs amp video...
807173     I watched your news on the death of the sailo...
Name: text_without_punct, Length: 803638, dtype: object

In [9]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    text_without_stopwords = ' '.join(filtered_tokens)
    return text_without_stopwords

mt['text_without_stopwords'] = mt['text_without_punct'].apply(remove_stopwords)
mt['text_without_stopwords'] 

0         Entitled obnoxious defensive lying weasel thin...
1                                Thank women survivors week
2                Knitting amp getting ready January 19 2019
3         Yep like triffeling women weaponized poon Wond...
4              President wants end movement posing movement
                                ...                        
807169    Let ’ forget “ iconic kiss ” uninvited sexual ...
807170    DEFINITELYthe one us support unconditionally G...
807171        movement count dollars Erin Andrews wondering
807172    one time fav songs amp videos brutally honest ...
807173    watched news death sailor famous WW2 Kiss phot...
Name: text_without_stopwords, Length: 803638, dtype: object