In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score
from nltk.tokenize import TweetTokenizer
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

#### 1. Load the tweets file using read_csv function from Pandas package. 

In [3]:
data = pd.read_csv('TwitterHate.csv')

In [4]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
data.drop('id', axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [7]:
data.shape

(31962, 2)

In [8]:
data.isnull().any()

label    False
tweet    False
dtype: bool

In [9]:
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [10]:
# Checking for Imbalance

(data.label.value_counts()[1]/data.shape[0])*100

7.014579813528565

In [11]:
(data.label.value_counts()[0]/data.shape[0])*100

92.98542018647143

### Conclusion - The data is Imbalanced

## Preprocessing

#### 2. Get the tweets into a list for easy text cleanup and manipulation.

In [12]:
tweets = data.tweet

In [13]:
tweets[0:5]

0     @user when a father is dysfunctional and is s...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model   i love u take with u all the time in ...
4               factsguide: society now    #motivation
Name: tweet, dtype: object

#### 3. To cleanup: 

###### 1. Normalize the casing.
###### 2. Using regular expressions, remove user handles. These begin with '@’.
###### 3. Using regular expressions, remove URLs.
###### 4. Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.
###### 5. Remove stop words.
###### 6. Remove redundant terms like ‘amp’, ‘rt’, etc.
###### 7. Remove ‘#’ symbols from the tweet while retaining the term.

#### 4. Extra cleanup by removing terms with a length of 1.


In [14]:
stop_words = stopwords.words('english')

In [15]:
def clean_data(tweets):
    # 3.1. Normalize the casing
    tweets = tweets.apply(lambda x: x.lower())
    
    # 3.2. Using regular expressions, remove user handles. These begin with '@’.
    tweets = tweets.apply(lambda x: re.sub('@\w+', '', x))
    
    # 3.3 Using regular expressions, remove URLs.
    url_pattern = '((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
    tweets = tweets.apply(lambda x: re.sub(url_pattern, '', x))
    
    # 3.7. Remove ‘#’ symbols from the tweet while retaining the term.
    tweets = tweets.apply(lambda x: re.sub('#','',x))
    
    # Removing extra '....'
    tweets = tweets.apply(lambda x: re.sub('\.+', '', x))
    
    # 3.4. Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.
    tweets = tweets.apply(lambda x: TweetTokenizer().tokenize(x))
    
    # 3.5. Remove stop words.
    # 3.6. Remove redundant terms like ‘amp’, ‘rt’, etc.
    # 4. Extra cleanup by removing terms with a length of 1.
    tweets = tweets.apply(lambda x: [words for words in x if words not in stop_words and len(words)>1 and words not in ['amp', 'rt'] and words not in string.punctuation])

    
    return tweets

In [16]:
tweets = clean_data(tweets)

In [17]:
tweets

0        [father, dysfunctional, selfish, drags, kids, ...
1        [thanks, lyft, credit, can't, use, cause, offe...
2                                        [bihday, majesty]
3                           [model, love, take, time, urð]
4                        [factsguide, society, motivation]
                               ...                        
31957                                    [ate, isz, youuu]
31958    [see, nina, turner, airwaves, trying, wrap, ma...
31959    [listening, sad, songs, monday, morning, otw, ...
31960    [sikh, temple, vandalised, calgary, wso, conde...
31961                                      [thank, follow]
Name: tweet, Length: 31962, dtype: object

## 5. Check out the top terms in the tweets:
##### 5.1. First, get all the tokenized terms into one large list.
##### 5.2. Use the counter and find the 10 most common terms.

In [18]:
tweets[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

In [19]:
# 5.1. First, get all the tokenized terms into one large list.
tokens = [j for i in tweets for j in i]

In [20]:
# 5.2. Use the counter and find the 10 most common terms.

counter = Counter(tokens)

In [21]:
counter = dict(sorted(counter.items(),key= lambda x:x[1], reverse=True)[:10])
counter

{'love': 2667,
 'day': 2249,
 'happy': 1678,
 'time': 1115,
 'like': 1095,
 'life': 1094,
 "i'm": 1011,
 'today': 1002,
 'new': 987,
 'positive': 928}

### 6. Data formatting for predictive modeling:

###### 1. Join the tokens back to form strings. This will be required for the vectorizers.
###### 2. Assign x and y.
###### 3. Perform train_test_split using sklearn.


In [22]:
# 6.1. Join the tokens back to form strings. This will be required for the vectorizers.

tweets = tweets.apply(lambda x: ' '.join(x))

In [23]:
tweets[0:5]

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit can't use cause offer wheel...
2                                       bihday majesty
3                             model love take time urð
4                        factsguide society motivation
Name: tweet, dtype: object

In [24]:
# 6.2. Assign x and y.

X = tweets
y = data.label

In [25]:
# 6.3. Perform train_test_split using sklearn.

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)

In [26]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(22373,)
(22373,)
(9589,)
(9589,)


### 7. We’ll use TF-IDF values for the terms as a feature to get into a vector space model.
###### 1. Import TF-IDF  vectorizer from sklearn.
###### 2. Instantiate with a maximum of 5000 terms in your vocabulary.
###### 3. Fit and apply on the train set.
###### 4. Apply on the test set.


In [27]:
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(X_train)

TfidfVectorizer(max_features=5000)

In [28]:
feature_names = tfidf_vect.get_feature_names()

X_train_dtm = tfidf_vect.transform(X_train)
X_train_dtm_df = pd.DataFrame(X_train_dtm.toarray(), columns=feature_names)

In [29]:
X_test_dtm = tfidf_vect.transform(X_test)
X_test_dtm_df = pd.DataFrame(X_test_dtm.toarray(), columns=feature_names)

### 8. Model building: Ordinary Logistic Regression

###### 1. Instantiate Logistic Regression from sklearn with default parameters.
###### 2. Fit into  the train data.
###### 3. Make predictions for the train and the test set.


In [30]:
lr_reg = LogisticRegression()
lr_reg.fit(X_train_dtm_df, y_train)

LogisticRegression()

In [31]:
test_pred = lr_reg.predict(X_test_dtm_df)

In [32]:
train_pred = lr_reg.predict(X_train_dtm_df)

### 9. Model evaluation: Accuracy, recall, and f_1 score.

###### 1. Report the accuracy on the train set.
###### 2. Report the recall on the train set: decent, high, or low.
###### 3. Get the f1 score on the train set.


In [33]:
print('Test Accuracy: ',accuracy_score(y_test, test_pred))
print('Train Accuracy: ', accuracy_score(y_train, train_pred))

Test Accuracy:  0.9512983627072688
Train Accuracy:  0.9551691771331515


In [34]:
print('------ Classification Report for Train ------- \n\n')

print(classification_report(y_train, train_pred))

------ Classification Report for Train ------- 


              precision    recall  f1-score   support

           0       0.96      1.00      0.98     20780
           1       0.96      0.39      0.55      1593

    accuracy                           0.96     22373
   macro avg       0.96      0.69      0.76     22373
weighted avg       0.96      0.96      0.95     22373



In [35]:
print('------ Classification Report for Test ------- \n\n')

print(classification_report(y_test, test_pred))

------ Classification Report for Test ------- 


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8940
           1       0.89      0.32      0.47       649

    accuracy                           0.95      9589
   macro avg       0.92      0.66      0.72      9589
weighted avg       0.95      0.95      0.94      9589



###### Accuracy on the train set is : 95%
###### Recall on the Train set is: 100% for Non-Toxic and 32% for Toxic, which is low
###### f1 score for Train set is: 97% for Nob-Toxic and 47% for Toxic, which is low

### 10. Looks like you need to adjust the class imbalance, as the model seems to focus on the 0s.
###### 1. Adjust the appropriate class in the LogisticRegression model.


In [36]:
from imblearn.over_sampling import SMOTE

In [37]:
sm = SMOTE()
X_train_dtm_df, y_train = sm.fit_resample(X_train_dtm_df, y_train)

In [38]:
print(X_train_dtm_df.shape)
print(y_train.shape)

(41560, 5000)
(41560,)


### 11. Train again with the adjustment and evaluate.

###### 1. Train the model on the train set.
###### 2. Evaluate the predictions on the train set: accuracy, recall, and f_1 score.


In [39]:
# 11.1. Train the model on the train set.

lr_reg_sm = LogisticRegression()
lr_reg_sm.fit(X_train_dtm_df, y_train)

LogisticRegression()

In [40]:
# 11.2. Evaluate the predictions on the train set: accuracy, recall, and f_1 score.

train_pred = lr_reg_sm.predict(X_train_dtm_df)

In [41]:
print('Train Accuracy: ', accuracy_score(y_train, train_pred))

Train Accuracy:  0.9564244465832531


In [42]:
print('------ Classification Report for Train ------- \n\n')

print(classification_report(y_train, train_pred))

------ Classification Report for Train ------- 


              precision    recall  f1-score   support

           0       0.98      0.93      0.96     20780
           1       0.93      0.98      0.96     20780

    accuracy                           0.96     41560
   macro avg       0.96      0.96      0.96     41560
weighted avg       0.96      0.96      0.96     41560



### Conclusion:
###### Accuracy on the train set is : 96%
###### Recall on the Train set is: 93% for Non-Toxic and 98% for Toxic, which is high
###### f1 score for Train set is: 96% for Non-Toxic and 96% for Toxic, which is high

In [43]:
print('------ Classification Report for Test ------- \n\n')

print(classification_report(y_test, test_pred))

------ Classification Report for Test ------- 


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8940
           1       0.89      0.32      0.47       649

    accuracy                           0.95      9589
   macro avg       0.92      0.66      0.72      9589
weighted avg       0.95      0.95      0.94      9589



In [44]:
print(confusion_matrix(y_test, test_pred))

[[8914   26]
 [ 441  208]]


### 12. Regularization and Hyperparameter tuning:

###### 1. Import GridSearch and StratifiedKFold because of class imbalance.

###### 2. Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.

###### 3. Use a balanced class weight while instantiating the logistic regression.

In [45]:
# 1. Import GridSearch and StratifiedKFold because of class imbalance.

from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [46]:
# 2. Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters.

grid = {"C": np.logspace(-3,3,7), "penalty": ["l1","l2"]}

### 13. Find the parameters with the best recall in cross validation.

###### 1. Choose ‘recall’ as the metric for scoring.

###### 2. Choose stratified 4 fold cross validation scheme.

###### 3. Fit into  the train set.

In [47]:
lr_model = LogisticRegression()
gd_lr = GridSearchCV(lr_model, param_grid=grid, scoring="recall", cv=4)
gd_lr.fit(X_train_dtm_df, y_train)

GridSearchCV(cv=4, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             scoring='recall')

### 14. What are the best parameters?

In [48]:
gd_lr.best_params_

{'C': 1000.0, 'penalty': 'l2'}

### 15. Predict and evaluate using the best estimator.

###### 1. Use the best estimator from the grid search to make predictions on the test set.

###### 2. What is the recall on the test set for the toxic comments?

###### 3. What is the f_1 score?

In [49]:
# 1. Use the best estimator from the grid search to make predictions on the test set.

lr_model = LogisticRegression(C=1000.0, penalty='l2')
lr_model.fit(X_train_dtm_df, y_train)

LogisticRegression(C=1000.0)

In [50]:
pred_test = lr_model.predict(X_test_dtm_df)

In [51]:
# 2. What is the recall on the test set for the toxic comments?

print("Classification Report on Test: \n\n")
print(classification_report(y_test, pred_test))

Classification Report on Test: 


              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8940
           1       0.44      0.67      0.53       649

    accuracy                           0.92      9589
   macro avg       0.71      0.80      0.74      9589
weighted avg       0.94      0.92      0.93      9589



### Ans: Recall on the Toxic Comments is: 67%

### 3. What is the f_1 score?

##### Ans: f_1 score for Toxic comments is: 53% and for non-toxic comment is: 96%