In [1]:
import numpy as np
import pandas as pd # for importing .csv file and converting it into Dataframe
from sklearn.model_selection import train_test_split # for splitting the data into train and test set
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression # importing logistic regression
from sklearn.metrics import accuracy_score # for checking the accuracy of the model

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Task 1:** Load tweets file using read_csv function from Pandas

In [2]:
twitter_data=pd.read_csv('/content/drive/MyDrive/AI/NLP/assesment_projects/proj_2/TwitterHate.csv')
twitter_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
twitter_data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [4]:
twitter_data['label'].value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

**Comments:** 93% of the tweets are non-hate and 7% of the tweets are Hate speech

In [5]:
twitter_data=twitter_data.drop(labels=['id'],axis=1)

In [6]:
twitter_data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


**Task 2:**Get the tweets into a list for easy text cleanup and manipulation.

In [7]:
tweet_list=twitter_data['tweet'].tolist()
type(tweet_list)

list

**Task 3:** To Cleanup

**3-1:** Normalize the Casing

In [8]:
for i in range(len(tweet_list)):
  tweet_list[i]=tweet_list[i].lower()

tweet_list[:5]

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

**3-2:** Using Regular expressions, remove user handles. These begin with '@'

In [9]:
for i in range(len(tweet_list)):
  tweet_list[i]=tweet_list[i].replace('@','')

tweet_list[:5]

[' user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "user user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

**Comment:** @ is replaced with ''

**3-3:** Using Regular Expression remove URLs

In [10]:
count=0
for i in range(len(tweet_list)):
  string=tweet_list[i]
  check_string='urð'
  if check_string in string:
    count=count+1
  else:
    count=count

print('There are {} tweets with urls'.format(count))

There are 325 tweets with urls


In [11]:
for i in range(len(tweet_list)):
  tweet_list[i]=tweet_list[i].replace('urÃ°ÂŸÂ“Â±!!! Ã°ÂŸÂ˜Â™Ã°ÂŸÂ˜ÂŽÃ°ÂŸÂ‘Â„Ã°ÂŸÂ‘Â…Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦',' ')

count=0
for i in range(len(tweet_list)):
  string = tweet_list[i]
  check_string='urÃ'
  if check_string in string:
    count=count+1
  else:
    count=count

print('There are {} tweets with urls'.format(count))

There are 0 tweets with urls


**3-4:** Using tweet tokenizer from NLTK tokenize the tweets in to indivudual words

In [12]:
from nltk import TweetTokenizer
tokenize = TweetTokenizer()
tweet_words=[]
for i in range (len(tweet_list)):
  words=tokenize.tokenize(tweet_list[i])
  tweet_words.append(words)

In [13]:
tweet_words[:2]

[['user',
  'when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  '.',
  '#run'],
 ['user',
  'user',
  'thanks',
  'for',
  '#lyft',
  'credit',
  'i',
  "can't",
  'use',
  'cause',
  'they',
  "don't",
  'offer',
  'wheelchair',
  'vans',
  'in',
  'pdx',
  '.',
  '#disapointed',
  '#getthanked']]

**3-5** Remove stop words.

In [14]:
stopwords=nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [15]:
import re
import string
def remove_stopwords(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens=re.split('\W+', text)
  text=[ps.stem(word) for word in tokens if word not in stopwords]
  return text

tweets_list_tokens = remove_stopwords(tweet_list)

**3-6** Remove redundant terms like ‘amp’, ‘rt’, etc.

In [16]:
for i in range(len(tweet_list)):
    tweet_list[i]=tweet_list[i].replace('amp',' ')

In [17]:
count = 0
for i in range(len(tweets_list_tokens)):
    count = count+1
    tweets_list_tokens[i] = tweets_list_tokens[i].replace('amp',' ')
    tweets_list_tokens[i] = tweets_list_tokens[i].replace('rt',' ')
    tweets_list_tokens[i] = tweets_list_tokens[i].replace('etc',' ')

tweets_list_tokens[:5]
count

293379

In [18]:
tweets_list_tokens[:5]

['', 'user', 'father', 'dysfunct', 'selfish']

**3-7** Remove ‘#’ symbols from the tweet while retaining the term.

In [19]:
for i in range(len(tweets_list_tokens)):
  tweets_list_tokens[i]=tweets_list_tokens[i].replace('#',' ')

**Task 4:** Extra cleanup by removing terms with a length of 1.

In [20]:
for i in range(len(tweets_list_tokens)):
  if len(tweets_list_tokens[i])<=1:
    tweets_list_tokens[i]=''

len(tweets_list_tokens)

293379

In [21]:
count=0
for i in range(len(tweets_list_tokens)):
  if tweets_list_tokens[i]=='':
    count=count+1

print(count)

29430


In [22]:
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]

In [23]:
#Tweets_List_Tokens = list(filter(lambda a: a != 999999999, Tweets_List_Tokens))
tweets_list_tokens = remove_values_from_list(tweets_list_tokens, '')
len(tweets_list_tokens)

263949

**Task-5:** Check out the top terms in the tweets

**5-1:** First, get all the tokenized terms into one large list.

In [24]:
type(tweets_list_tokens)

list

**Comments:** It is already in list, so, there is no need to convert

**5-2:** Use the counter and find the 10 most common terms.

In [25]:
import collections

tweets_list_tokens_count = collections.Counter(tweets_list_tokens)
[(l,k) for k,l in sorted([(j,i) for i,j in tweets_list_tokens_count.items()], reverse=True)][0:10]

[('user', 16497),
 ('love', 3101),
 ('day', 2752),
 ('happi', 1983),
 ('thank', 1548),
 ('get', 1244),
 ('time', 1241),
 ('go', 1135),
 ('like', 1103),
 ('life', 1096)]

**Task 6:** Data formatting for predictive modeling

**6-1:** Join the tokens back to form strings. This will be required for the vectorizers.

**6-2:** Assign x and y.

In [26]:
twitter_data.columns

Index(['label', 'tweet'], dtype='object')

In [27]:
X=twitter_data['tweet']
y=twitter_data['label']

**6-3:** Perform train_test_split using sklearn

In [28]:
from sklearn.model_selection import train_test_split
(X_train,X_test,y_train,y_test)=train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
print(X_train.shape)

(25569,)


**Task 7:** We’ll use TF-IDF values for the terms as a feature to get into a vector space model.

**7-1:** Import TF-IDF  vectorizer from sklearn.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

**7-2:** Instantiate with a maximum of 5000 terms in your vocabulary.

In [31]:
tfidf_vector=TfidfVectorizer(analyzer='word')

**7-3:** Fit and apply on the train set

In [32]:
tfidf_vector.fit(X_train)
X_train_tfidf=tfidf_vector.transform(X_train)

In [33]:
feature_names=tfidf_vector.get_feature_names()
pd.DataFrame(X_train_tfidf.toarray(),columns=feature_names)



Unnamed: 0,00,000,000001,001,0099,00am,00h30,01,0115,02,0266808099,03,030916,03111880779,033,0345,039,04,0450,04pm,05,05pm,06,0606,0609,0610,0612,0613,0616,0617,0618,0618â,0619,064,07,07788427999,08,09,09053111156,095m,...,æµ,ç²½å,çµ,ç½,ç¾,èª,è¾¹,è¾¼,ê²,êµ,ëª,ë³,ë¹,ì¹,îµï½,ï¼,ï½,ï¾,ð²ð½ñ,ð²ð¾ñ,ð²ñ,ðµð,ðµð¹,ðµð½ð,ðµð½ñ,ð¹,ð¹ð,ð¹ñ,ðº,ðºð,ðºð¾ð,ðºð¾ñ,ð¼ð,ð½ð,ð¾ð,ð¾ð½ð,ð¾ð½ðµð,ð¾ñ,ó¾,ø¹ù
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**7-4:** Apply on the test data 

In [34]:
X_test_tfidf=tfidf_vector.transform(X_test)

In [35]:
feature_names_test=tfidf_vector.get_feature_names()
pd.DataFrame(X_test_tfidf.toarray(),columns=feature_names_test)



Unnamed: 0,00,000,000001,001,0099,00am,00h30,01,0115,02,0266808099,03,030916,03111880779,033,0345,039,04,0450,04pm,05,05pm,06,0606,0609,0610,0612,0613,0616,0617,0618,0618â,0619,064,07,07788427999,08,09,09053111156,095m,...,æµ,ç²½å,çµ,ç½,ç¾,èª,è¾¹,è¾¼,ê²,êµ,ëª,ë³,ë¹,ì¹,îµï½,ï¼,ï½,ï¾,ð²ð½ñ,ð²ð¾ñ,ð²ñ,ðµð,ðµð¹,ðµð½ð,ðµð½ñ,ð¹,ð¹ð,ð¹ñ,ðº,ðºð,ðºð¾ð,ðºð¾ñ,ð¼ð,ð½ð,ð¾ð,ð¾ð½ð,ð¾ð½ðµð,ð¾ñ,ó¾,ø¹ù
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Task 8:** Model building: Ordinary Logistic Regression

**8-1:** Instantiate Logistic Regression from sklearn with default parameters.

In [36]:
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()

**8-2:** Fit into train data

In [37]:
log_reg.fit(X_train_tfidf,y_train)

LogisticRegression()

**8-3:** Make predictions for the train and test set

In [38]:
y_pred_class_train=log_reg.predict(X_train_tfidf)
y_pred_class_test=log_reg.predict(X_test_tfidf)

**Task 9:** Model evaluation -  (1) accuracy     (2) Recall     (3) f1_score 

**9-1** Report accuracy on the train data set

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score_train=accuracy_score(y_train,y_pred_class_train)
print('Using Logistic Regression(with default parameters), train set accuracy is {}'.format(accuracy_score_train))

Using Logistic Regression(with default parameters), train set accuracy is 0.9523250811529587


**9-2:** Report the recall on the train set : decent, high or low

In [40]:
from sklearn.metrics import recall_score
recall_score(y_train,y_pred_class_train, average='macro')

0.6639132382051433

**9-3:** Get the f1 score on the train set

In [41]:
from sklearn.metrics import f1_score
lr_f1_score_train=f1_score(y_train,y_pred_class_train, average='macro')
lr_f1_score_train

0.7327949415836879

In [42]:
from sklearn.metrics import confusion_matrix
cm_train=confusion_matrix(y_train,y_pred_class_train)
print(cm_train)

[[23763    20]
 [ 1199   587]]


**Comments:** f1 score of the model is 73%. There are 20 False Positives. Meaning Ordinary Logistic Reg. model has classified 

(i) False Positive: 20 tweets as non-hate, but which were actually hate tweets.
(ii) False Negative: 1199 tweets as hate and 



**Task 10:** Looks like you need to adjust the class imbalance, as the model seems to focus on the 0s.

In [43]:
weighted_log_reg=LogisticRegression(random_state=42, class_weight='balanced')

**Task 11:** Train again with adjustment and evaluate

**11-1:** Train the model on the train set

In [44]:
weighted_log_reg.fit(X_train_tfidf,y_train)
weighted_y_pred_train=weighted_log_reg.predict(X_train_tfidf)

**11-2:** Evaluate the predictions on the train set: accuracy, recall, and f_1 score.

**11-2-1** Accuracy Score

In [45]:
wlr_score_train=accuracy_score(y_train, weighted_y_pred_train)

In [46]:
print('Accuracy of the wighted Logistic Regression is {}'.format(wlr_score_train))
print('Using Logistic Regression(with default parameters), train set accuracy is {}'.format(accuracy_score_train))
print('Accuracy of the model increased by {}'.format(wlr_score_train-accuracy_score_train))

Accuracy of the wighted Logistic Regression is 0.9658962024326333
Using Logistic Regression(with default parameters), train set accuracy is 0.9523250811529587
Accuracy of the model increased by 0.013571121279674636


**11-2-2** Recall 

In [47]:
wlr_recall_score_train=recall_score(y_train,weighted_y_pred_train,average='macro')

In [48]:
wlr_recall_score_train

0.9777836008753842

In [49]:
print('Using weighted logistic regression Recall is 98%')
print('Using Logistic Regression (Default parameters), recall value is 66%')
print('Recall value is increased by 32%')

Using weighted logistic regression Recall is 98%
Using Logistic Regression (Default parameters), recall value is 66%
Recall value is increased by 32%


**11-2-3** f1_score

In [50]:
from sklearn.metrics import f1_score
wlr_f1_score_train=f1_score(y_train,weighted_y_pred_train)
wlr_f1_score_train

0.8024467603081107

In [51]:
print('Using Logistic Regression (with default parameters), f1_score is {}'.format(round(lr_f1_score_train*100),2))
print('Using Weighted Logistic Regression, fi_score is {}'.format(round(wlr_f1_score_train*100),2))
print('There is a increase in accuracy of 7%')

Using Logistic Regression (with default parameters), f1_score is 73
Using Weighted Logistic Regression, fi_score is 80
There is a increase in accuracy of 7%


**Task 12** Regularization and hyperparameter tuning

**12-1** Import GridSearch and StratifiedKFold because of class imbalance.

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeClassifier

**12-2** Provide the parameter grid to choose for ‘C’ and ‘penalty’ parameters

In [53]:
grid={"C": [1, 2], "penalty":["l2"]}

In [54]:
lr=LogisticRegression(class_weight='balanced')
lr_cv=GridSearchCV(lr,grid)
lr_cv.fit(X_train_tfidf,y_train)
lr_cv_y_pred_train=lr_cv.predict(X_train_tfidf)

In [55]:
lr_cv_accuracy_score=accuracy_score(y_train,lr_cv_y_pred_train)
lr_cv_accuracy_score

0.9765340842426375

In [56]:
wlr_cv_train=recall_score(y_train, lr_cv_y_pred_train, average='macro')
wlr_cv_train

0.9860912890106275

In [57]:
from sklearn.metrics import f1_score
wlr_cv_f1_score=f1_score(y_train,lr_cv_y_pred_train, average='macro')
wlr_cv_f1_score

0.9215330297999154

In [58]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, lr_cv_y_pred_train))

[[23188   595]
 [    5  1781]]


In [59]:
print(confusion_matrix(y_train, weighted_y_pred_train))

[[22926   857]
 [   15  1771]]


**Comments:** By applying GridSearchCV (i) False positives are reduced to 595 from 857 and (ii) False Negatives reduced to 5 from 15

**Task 13:** Find the parameters with the best recall in cross-validation

**13-1** Choose recall as metrics for scoring

In [60]:
lr_kf=GridSearchCV(lr,grid)

**13-2** Choose stratified 4 fold cross validation scheme 

In [61]:
from numpy import array
from sklearn.model_selection import KFold
kf=KFold(n_splits=4)
kf.get_n_splits(twitter_data)

4

**13-3** Fit into the train set

In [71]:
count = 0
for train_index, test_index in kf.split(twitter_data):
  KF_X_train, KF_X_test, KF_y_train, KF_y_test = twitter_data.tweet[train_index], twitter_data.tweet[test_index],\
  twitter_data.label[train_index],twitter_data.label[test_index]
  KF_X_train_tfidf = tfidf_vector.transform(KF_X_train)
  KF_X_test_tfidf = tfidf_vector.transform(KF_X_test)
  count=count+1
  lr_kf.fit(KF_X_train_tfidf,KF_y_train)
  KF_cv_y_pred_class_train = lr_kf.predict(KF_X_train_tfidf)
  KF_recall_metric = recall_score(KF_y_train, KF_cv_y_pred_class_train, average = "macro")
  print("Recall score for fold" + str(count) + " is " + str(KF_recall_metric))

Recall score for fold1 is 0.9852848594166654
Recall score for fold2 is 0.9848743871347061
Recall score for fold3 is 0.9839939376073416
Recall score for fold4 is 0.9851718489670318


**Task 14:** 14. What are the best parameters?

In [73]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",lr_kf.best_estimator_)
print("\n The best score across ALL searched params:\n",lr_kf.best_score_)
print("\n The best parameters across ALL searched params:\n",lr_kf.best_params_) 

 Results from Grid Search 

 The best estimator across ALL searched params:
 LogisticRegression(C=2, class_weight='balanced')

 The best score across ALL searched params:
 0.9460202903960155

 The best parameters across ALL searched params:
 {'C': 2, 'penalty': 'l2'}


**Task 15** Predict and evaluate using the best estimator

**15-1** Use the best estimator from the grid search to make predictions on the test set

In [75]:
logreg_be=LogisticRegression(C=2, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

logreg_be_cv=GridSearchCV(logreg_be,grid)
logreg_be_cv.fit(X_test_tfidf,y_test)


be_cv_y_pred_class_test = logreg_be_cv.predict(X_test_tfidf)

**15-2:** What is the recall on the test set for the toxic comments.

In [76]:
be_recall = recall_score(y_test, be_cv_y_pred_class_test, average = "macro")
be_recall

0.9872814774429759

**15-3:** What is the f_1 score?

In [77]:
from sklearn.metrics import f1_score
be_f1_score = f1_score(y_test, be_cv_y_pred_class_test, average = "macro")
be_f1_score

0.9278249801638436

In [78]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, be_cv_y_pred_class_test))

[[5799  138]
 [   1  455]]


**Comments:** Applying best parameters on the test data, we get f1 score of 0.92