In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
train = pd.read_csv('train_tweets.csv')
test = pd.read_csv('test_tweets.csv')

In [3]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# combine train and test data
combi = train.append(test, ignore_index = True)

In [6]:
combi_1 = combi.copy()

In [7]:
combi.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [8]:
combi_1.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [9]:
# write function for removing @user
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'',input_txt)
    return input_txt

In [10]:
combi_1['tidy_tweet'] = np.vectorize(remove_pattern)(combi_1['tweet'], '@[\w]*')

In [11]:
combi_1.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation,factsguide: society now #motivation


In [12]:
# remove special characters, numbers, punctuations
combi_1['tidy_tweet'] = combi_1['tidy_tweet'].str.replace('[^a-zA-Z#]+',' ')

In [13]:
combi_1.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so self...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause the...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur
4,5,0.0,factsguide: society now #motivation,factsguide society now #motivation
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before they lea...
6,7,0.0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0.0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land #allin #cavs #champions #...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...,welcome here i m it s so #gr


In [14]:
# remove short words
combi_1['tidy_tweet'] = combi_1['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))


In [15]:
combi_1.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fare talking before they leave chaos disp...
6,7,0.0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0.0,the next school year is the year for exams.ð...,next school year year exams think about that #...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,love land #allin #cavs #champions #cleveland #...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...,welcome here


In [16]:
# create new variable tokenized tweet 
tokenized_tweet = combi_1['tidy_tweet'].apply(lambda x: x.split())

In [17]:
tokenized_tweet

0        [when, father, dysfunctional, selfish, drags, ...
1        [thanks, #lyft, credit, cause, they, offer, wh...
2                                  [bihday, your, majesty]
3                         [#model, love, take, with, time]
4                       [factsguide, society, #motivation]
                               ...                        
49154    [thought, factory, left, right, polarisation, ...
49155    [feeling, like, mermaid, #hairflip, #neverread...
49156    [#hillary, #campaigned, today, #ohio, used, wo...
49157    [happy, work, conference, right, mindset, lead...
49158    [song, glad, free, download, #shoegaze, #newmu...
Name: tidy_tweet, Length: 49159, dtype: object

In [18]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

# apply stemmer for tokenized_tweet
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])

In [19]:
tokenized_tweet

0        [when, father, dysfunct, selfish, drag, kid, i...
1        [thank, #lyft, credit, caus, they, offer, whee...
2                                  [bihday, your, majesti]
3                         [#model, love, take, with, time]
4                             [factsguid, societi, #motiv]
                               ...                        
49154    [thought, factori, left, right, polaris, #trum...
49155    [feel, like, mermaid, #hairflip, #neverreadi, ...
49156    [#hillari, #campaign, today, #ohio, use, word,...
49157    [happi, work, confer, right, mindset, lead, cu...
49158    [song, glad, free, download, #shoegaz, #newmus...
Name: tidy_tweet, Length: 49159, dtype: object

In [20]:
# join tokens into one sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
# change combi['tidy_tweet'] to tokenized_tweet

In [21]:
combi_1['tidy_tweet'] = tokenized_tweet

In [22]:
combi_1.head(15)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0.0,bihday your majesty,bihday your majesti
3,4,0.0,#model i love u take with u all the time in ...,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguid societi #motiv
5,6,0.0,[2/2] huge fan fare and big talking before the...,huge fare talk befor they leav chao disput whe...
6,7,0.0,@user camping tomorrow @user @user @user @use...,camp tomorrow danni
7,8,0.0,the next school year is the year for exams.ð...,next school year year exam think about that #s...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...,love land #allin #cav #champion #cleveland #cl...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...,welcom here


# part 3

#Understanding the common words used in the tweets: WordCloud

In [23]:
# create text from all tweets
all_words = ' '.join([text for text in combi_1['tidy_tweet']])

from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


ModuleNotFoundError: No module named 'wordcloud'

#Words in non racist/sexist tweets

In [None]:
# create text from just normal tweets
normal_words = ' '.join([text for text in combi_1['tidy_tweet'][combi_1['label'] == 0]])

wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

#Racist/Sexist Tweets

In [None]:
# create text from just negative tweets

negative_words = ' '.join([text for text in combi_1['tidy_tweet'][combi_1['label'] == 1]])

wordcloud = WordCloud(width=800, height=500,
random_state=21, max_font_size=110).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()


In [None]:
# function to collect hashtags
def hashtag_extract(x):
    hashtags = []
    for i in x:
        ht = re.findall(r'#(\w+)', i)
        hashtags.append(ht)
    return hashtags

In [None]:
# extracting hashtags from non racist/sexist tweets
HT_regular = hashtag_extract(combi_1['tidy_tweet'][combi_1['label'] == 0])

# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(combi_1['tidy_tweet'][combi_1['label'] == 1])


# unnesting list
HT_regular = sum(HT_regular, [])
HT_negative = sum(HT_negative, [])

In [None]:
HT_regular

In [None]:
HT_negative

#Non-Racist/Sexist Tweets

In [None]:
# making frequency distribution top 10 normal hashtags
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count' : list(a.values())})

d = d.nlargest(columns = 'Count', n = 10)

plt.figure(figsize = (16,5))
ax = sns.barplot(data =d, x = 'Hashtag', y = 'Count')
plt.show()

In [None]:
d

#Racist/Sexist Tweets

In [None]:
# making frequency distribution top 10 negative hashtags

a = nltk.FreqDist(HT_negative)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count' : list(a.values())})

d = d.nlargest(columns = 'Count', n = 10)

plt.figure(figsize = (16,5))
ax = sns.barplot(data =d, x = 'Hashtag', y = 'Count')
plt.show()

# Extracting Features from Cleaned Tweets


#Bag-of-Words Features¶

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# bag-of-words feature matrix

bow = bow_vectorizer.fit_transform(combi_1['tidy_tweet'])

In [25]:
len(bow_vectorizer.get_feature_names())

1000

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(combi_1['tidy_tweet'])

# Model Building: Sentiment Analysis

In [27]:
train.shape

(31962, 3)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# make train and test bow
train_bow = bow[:31962,:]
test_bow = bow[31962:,:]

# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'], test_size = 0.3)
# initialize Logistic Regression
lr = LogisticRegression()
# fit the model
lr.fit(xtrain_bow, ytrain)
# predicting on the validation set
prediction = lr.predict_proba(xvalid_bow)
# if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction[:,1] >= 0.3
# convert to int values
prediction_int = prediction_int.astype(np.int)
# calculating f1 score
f1_score(yvalid, prediction_int) 

0.5543859649122808

In [29]:
# writing data to a CSV file

test_pred = lr.predict_proba(test_bow)
# if prediction is greater than or equal to 0.3 than 1 else 0
test_pred_int = test_pred[:,1] >= 0.3
# convert to int values
test_pred_int = test_pred_int.astype(np.int)

test['label'] = test_pred_int 
submission = test[['id', 'label']]
submission.to_csv('lr_bow_sub.csv', index=False)

#Building model using TF-IDF features

In [30]:
# make train and test bow
train_bow = tfidf[:31962,:]
test_bow = tfidf[31962:,:]

# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'], test_size = 0.3)
# initialize Logistic Regression
lr = LogisticRegression()
# fit the model
lr.fit(xtrain_bow, ytrain)
# predicting on the validation set
prediction = lr.predict_proba(xvalid_bow)
# if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction[:,1] >= 0.3
# convert to int values
prediction_int = prediction_int.astype(np.int)
# calculating f1 score
f1_score(yvalid, prediction_int)

0.5865051903114187

In [31]:
# writing data to a CSV file

test_pred = lr.predict_proba(test_bow)
# if prediction is greater than or equal to 0.3 than 1 else 0
test_pred_int = test_pred[:,1] >= 0.3
# convert to int values
test_pred_int = test_pred_int.astype(np.int)

test['label'] = test_pred_int 
submission = test[['id', 'label']]
submission.to_csv('lr_tfidf_sub.csv', index = False)