#            Natural Language Processing (NLP)

## Project 2
### Help Twitter Combat Hate Speech Using NLP and Machine Learning

#### Import necessory Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re

### Read the csv file using pandas

In [2]:
tweets0 = pd.read_csv(r'F:\._LEARNING\AI\_6. NLP\project\TwitterHate.csv')

In [3]:
tweets0.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
tweets0.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

## Get the tweets into a list, for easy text clean up and manipulation

In [5]:
tweets1 = tweets0.tweet.values

In [6]:
len(tweets1)

31962

In [7]:
tweets1[:5]

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

The tweets contain -

1 URLs
2 Hashtags
3 User handles
4 'RT'

## Cleanup

Normalizing case

In [8]:
tweets_lower = [twt.lower() for twt in tweets1]

In [9]:
tweets_lower[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### Remove user handles, begin with '@'

In [10]:
re.sub("@\w+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

' this course rocks! http://rahimbaig.com/ai'

In [11]:
tweets_nouser = [re.sub("@\w+","", twt) for twt in tweets_lower]

In [12]:
tweets_nouser[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### Remove URLs

In [13]:
re.sub("\w+://\S+","", "@Rahim this course rocks! http://rahimbaig.com/ai")

'@Rahim this course rocks! '

In [14]:
tweets_nourl = [re.sub("\w+://\S+","", twt) for twt in tweets_nouser]

In [15]:
tweets_nourl[:5]

['  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "  thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### Tokenze using Tweet Tokenizer from NLTK

In [16]:
from nltk.tokenize import TweetTokenizer
tkn = TweetTokenizer()


In [17]:
tweet_token = [tkn.tokenize(sent) for sent in tweets_nourl]
print(tweet_token[0])


['when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#run']


### Remove punctuations and stop words and other redundant terms tike 'rt', 'amp' Also remove hashtags

In [18]:
from nltk.corpus import stopwords
from string import punctuation


In [19]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)


In [20]:
stop_punct.extend(['...','``',"''",".."])

In [21]:
stop_context = ['rt', 'amp']

In [22]:
stop_final = stop_nltk + stop_punct + stop_context

Function to
remove stop words from a single tokenized sentence
remove # tags
remove terms with length = 1

In [23]:
def del_stop(sent):
    return [re.sub("#","",term) for term in sent if ((term not in stop_final) & (len(term)>1))]


In [24]:
tweets_clean = [del_stop(tweet) for tweet in tweet_token]

In [25]:
tweets_clean[0]

['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']

### Check out the top terms in the tweets

In [26]:
from collections import Counter

In [27]:
term_list = []
for tweet in tweets_clean:
    term_list.extend(tweet)


In [28]:
res = Counter(term_list)
res.most_common(10)


[('love', 2748),
 ('day', 2276),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013),
 ('new', 994),
 ('thankful', 946)]

## Data formatting for predictive modeling
Join the tokens back into strings

In [29]:
tweets_clean = [" ".join(tweet) for tweet in tweets_clean]

In [30]:
len(tweets_clean), len(tweets_clean)

(31962, 31962)

### Separate X and Y and perform train test split

In [31]:
X = tweets_clean
y = tweets0.label.values


In [32]:
# Import train_test_split Library 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=2)


### Create a document term matrix using count vectorizer

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vectorizer = TfidfVectorizer(max_features = 5000)

In [35]:
X_train_bow = vectorizer.fit_transform(X_train)

In [36]:
X_test_bow = vectorizer.transform(X_test)
X_train_bow.shape, X_test_bow.shape


((22373, 5000), (9589, 5000))

## Model building

#### Using a simple Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()


In [38]:
logreg.fit(X_train_bow, y_train)

LogisticRegression()

In [39]:
y_train_pred = logreg.predict(X_train_bow)
y_test_pred = logreg.predict(X_test_bow)


In [40]:
from sklearn.metrics import accuracy_score, classification_report

In [41]:
accuracy_score(y_train, y_train_pred)

0.9544987261431189

In [42]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     20810
           1       0.96      0.36      0.53      1563

    accuracy                           0.95     22373
   macro avg       0.96      0.68      0.75     22373
weighted avg       0.95      0.95      0.94     22373



### Adjusting for class imbalance

In [43]:
logreg = LogisticRegression(class_weight="balanced")

In [44]:
logreg.fit(X_train_bow, y_train)

LogisticRegression(class_weight='balanced')

In [45]:
y_train_pred = logreg.predict(X_train_bow)
accuracy_score(y_train, y_train_pred)


0.9521297993116703

In [46]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     20810
           1       0.60      0.98      0.74      1563

    accuracy                           0.95     22373
   macro avg       0.80      0.96      0.86     22373
weighted avg       0.97      0.95      0.96     22373



In [47]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [48]:
# Create the parameter grid based on the results of random search 
param_grid = {
                'C': [0.01,0.1,1,10,100],
                'penalty': ["l1","l2"]}


In [49]:
classifier_lr = LogisticRegression(class_weight="balanced")

In [50]:
grid_search = GridSearchCV(estimator = classifier_lr, param_grid = param_grid, 

cv = StratifiedKFold(4), n_jobs = -1, verbose = 1, scoring = "recall" )


In [51]:
grid_search.fit(X_train_bow, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    5.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(class_weight='balanced'), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

In [52]:
grid_search.best_estimator_

LogisticRegression(C=1, class_weight='balanced')

### Using the best estimator to make predictions on the test set


In [53]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8910
           1       0.47      0.75      0.58       679

    accuracy                           0.92      9589
   macro avg       0.73      0.84      0.77      9589
weighted avg       0.94      0.92      0.93      9589



##### Looks like you did a good job on the test set. The f1_score for 1 class is 0.58 and the recall is 0.75. Great!
#### and accuracy for F1_score  is 0.92