In [1]:
import pandas as pd 
import numpy as np
import os
import re # regualr expression module

### 1. Load csv data

In [2]:
inp_tweets = pd.read_csv('Twitterhate.csv')
inp_tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
inp_tweets.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [6]:
inp_tweets.tweet.sample().values[0]

"registered for fall classes! hopefully graphic and web goes as well as admin profesh and i make the dean's list each semester!  "

### 2. Get the tweets into a list, for easy text clean up and manipulation

In [8]:
tweetList = inp_tweets.tweet.values
tweetList

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty', ...,
       'listening to sad songs on a monday morning otw to work is sad  ',
       '@user #sikh #temple vandalised in in #calgary, #wso condemns  act  ',
       'thank you @user for you follow  '], dtype=object)

In [9]:
len(tweetList)

31962

In [10]:
tweetList[0:5]

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

The tweets contain -

URLs,

Hashtags,

User handles,

'RT'

### 3. Clean Up

#### 3.1 Normalizing case - to lower case

In [13]:
tweetsLower = [twt.lower() for twt in tweetList]
tweetsLower[0:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

#### 3.2 Remove user handles, begin with '@'

In [52]:
re.sub('@\w+', '', '@anand.khare')

'.khare'

In [21]:
tweetsNoUser = [re.sub('@\w+', '', twt) for twt in tweetsLower]
tweetsNoUser[0:10]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  ',
 '  camping tomorrow        dannyâ\x80¦',
 "the next school year is the year for exams.ð\x9f\x98¯ can't think about that ð\x9f\x98\xad #school #exams   #hate #imagine #actorslife #revolutionschool #girl",
 'we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â\x80¦ ',
 "   welcome here !  i'm   it's so #gr8 ! "]

#### 3.3 Remove URLs

In [19]:
re.sub('\w+://\S+', '', 'Use this link : https://anand.khare')

'Use this link : '

In [22]:
tweetsNoUrl = [re.sub('\w+://\S+', '', twt) for twt in tweetsNoUser]
tweetsNoUrl[0:10]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  ',
 '  camping tomorrow        dannyâ\x80¦',
 "the next school year is the year for exams.ð\x9f\x98¯ can't think about that ð\x9f\x98\xad #school #exams   #hate #imagine #actorslife #revolutionschool #girl",
 'we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â\x80¦ ',
 "   welcome here !  i'm   it's so #gr8 ! "]

### 3.4 Tokenze using Tweet Tokenizer from NLTK

In [23]:
from nltk.tokenize import TweetTokenizer

In [25]:
tkn = TweetTokenizer()

In [27]:
print(tkn.tokenize(tweetsNoUrl[0]))

['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


In [31]:
tweetsToken = [tkn.tokenize(twt) for twt in tweetsNoUrl]

In [32]:
tweetsToken[0:1]

[['when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  '.',
  '#run']]

### 3.5 Remove punctuations and stop words and other redundant terms tike 'rt', 'amp'¶

Also remove hashtags

In [36]:
from nltk.corpus import stopwords
from string import punctuation

In [38]:
stop_nltk = stopwords.words('english')
stop_punctn = list(punctuation)

In [39]:
stop_punctn.extend(['...', '``', "''", ".."])

In [40]:
stop_context = ['rt', 'amp']

In [42]:
stop_final =stop_nltk + stop_punctn + stop_context

Funtion to
        remove stop words from a single tokenized sentence
        remove # tags
        remove terms with length = 1

In [45]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]

In [46]:
del_stop(tweetsToken[4])

['factsguide', 'society', 'motivation']

In [49]:
tweetClean = [del_stop(twt) for twt in tweetsToken]

In [50]:
tweetClean[0:5]

[['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'],
 ['thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  'disapointed',
  'getthanked'],
 ['bihday', 'majesty'],
 ['model', 'love', 'take', 'time', 'urð'],
 ['factsguide', 'society', 'motivation']]

### 5. Check out the top terms in the tweets:

In [53]:
from collections import Counter

In [56]:
term_list = []
for twt in tweetClean:
    term_list.extend(twt)

In [58]:
res = Counter(term_list)
res.most_common(10)

[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

### 6. Data formatting for predictive modeling:

#### 6.1 Join the tokens back into strings

In [61]:
tweetClean[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [89]:
tweetCleanJoined = ["".join(twt) for twt in tweetClean]

In [90]:
tweetCleanJoined[0]

'father dysfunctional selfish drags kids dysfunction run'

#### 6.2 Separate X and Y and perform train test split, 70-30

In [91]:
len(tweetCleanJoined)

31962

In [92]:
len(inp_tweets.label)

31962

In [93]:
X = tweetCleanJoined
y = inp_tweets.label.values

Train test split

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [95]:
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))

22373 9589
22373 9589


### 7. Use TF-IDF values for the terms as a feature to get into a vector space model.

In [96]:
# Import TF-IDF  vectorizer from sklearn. 
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
#Instantiate with a maximum of 5000 terms in your vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)

In [98]:
# Train Model
len(X_train), len(X_test)

(22373, 9589)

In [99]:
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [100]:
X_train_bow.shape, X_test_bow.shape

((22373, 5000), (9589, 5000))

### 8. Model building: Ordinary Logistic Regression

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
logReg = LogisticRegression()
logReg.fit(X_train_bow, y_train)

LogisticRegression()

In [103]:
y_train_pred = logReg.predict(X_train_bow)
y_test_pred = logReg.predict(X_test_bow)

### 9. Model evaluation: Accuracy, recall, and f_1 score.

In [104]:
from sklearn.metrics import accuracy_score, classification_report

In [105]:
accuracy_score(y_train, y_train_pred)

0.9560184150538595

In [107]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     20815
           1       0.96      0.39      0.55      1558

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



Looking at precision recall and f1-score values model seems to focus on '0'

### 9. adjust the class imbalance, as the model seems to focus on the 0s.

In [108]:
logRegBal = LogisticRegression(class_weight = 'balanced')
logRegBal.fit(X_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [109]:
y_train_pred = logRegBal.predict(X_train_bow)
y_test_pred = logRegBal.predict(X_test_bow)

In [110]:
accuracy_score(y_train, y_train_pred)

0.9527108568363652

In [111]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20815
           1       0.60      0.97      0.74      1558

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



### 10. Regularization and Hyperparameter tuning:

In [117]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [126]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'C' : [0.01, 0.1, 1, 10, 100],
    'penalty' : ["l1","l2"]
}

In [127]:
lrModelK = LogisticRegression(class_weight = 'balanced')

Find the parameters with the best recall in cross validation.

Choose ‘recall’ as the metric for scoring.

Choose stratified 4 fold cross validation scheme.

Fit into  the train set.

In [128]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lrModelK, 
                           param_grid = param_grid,
                           cv = StratifiedKFold(4), 
                           n_jobs = -1, 
                           verbose = 1, 
                           scoring = 'recall')

In [129]:
grid_search.fit(X_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\anand.khare\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\anand.khare\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\anand.khare\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        na

GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [130]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced')

### 11. Predict and evaluate using the best estimator.

Use the best estimator from the grid search to make predictions on the test set.

What is the recall on the test set for the toxic comments?

What is the f_1 score?

In [131]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [132]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [135]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20815
           1       0.60      0.97      0.74      1558

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



In [133]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8905
           1       0.49      0.77      0.60       684

    accuracy                           0.93      9589
   macro avg       0.73      0.85      0.78      9589
weighted avg       0.95      0.93      0.93      9589

