# Twitter Sentiment Analysis
https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/
**Aleksey Shipitsyn**    
**2019-08-01**

## Read and explore data

In [1]:
import numpy as np
import pandas as pd


In [2]:
# read train data
df_train = pd.read_csv('./Documents/Competitions/Twitter Sentiment/train_E6oV3lV.csv') 

print('Missing values:')
print(df_train.isnull().sum(),'\n')
print('shape:', df_train.shape, '\n')

df_train.head()



Missing values:
id       0
label    0
tweet    0
dtype: int64 

shape: (31962, 3) 



Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# check the balance of target sentiment 
pd.crosstab(index=df_train.label, columns=['label'], normalize=True).round(2)



col_0,label
label,Unnamed: 1_level_1
0,0.93
1,0.07


In [4]:
# read test data
df_test = pd.read_csv('./Documents/Competitions/Twitter Sentiment/test_tweets_anuFYb8.csv')

print('Missing values:')
print(df_test.isnull().sum(),'\n')
print('shape:', df_test.shape, '\n')

df_test.head()


Missing values:
id       0
tweet    0
dtype: int64 

shape: (17197, 2) 



Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# read submission data
df_submission = pd.read_csv('./Documents/Competitions/Twitter Sentiment/sample_submission_gfvA5FD.xls')

# check if submission hash corresponds to test set
print('Submission file matches test set file:', all(df_test.id == df_submission.id))

df_submission.head()


Submission file matches test set file: True


Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


## Data preprocessing  

### Text preprocessing  

- Add more features 
- Remove everything except of english letters
- Stopwords  
- Stemming 
- N-grams  
- TFIDF  


In [6]:
# Add more features

train = df_train
test = df_test


# number of words
train['words_count'] = train['tweet'].apply(lambda x: len(str(x).split(' ')))
test['words_count'] = test['tweet'].apply(lambda x: len(str(x).split(' ')))


# number of characters
train['char_count'] = train['tweet'].str.len()
test['char_count'] = test['tweet'].str.len()


# average word length
def avg_word_length(sentence):
    words = sentence.split()
    return sum( len(word) for word in words) / len(words) 

train['avg_word_length'] = train['tweet'].apply(lambda x: avg_word_length(x))
test['avg_word_length'] = test['tweet'].apply(lambda x: avg_word_length(x))


# number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')

train['stopword_count'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
test['stopword_count'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))


# number of special characters
train['hashtags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
test['hashtags'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))


# number of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
test['numerics'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


# number od uppercase words
train['uppercase'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
test['uppercase'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

print(train[:5], '\n')
print(test[:5])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alekseyshipitsyn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   id  label                                              tweet  words_count  \
0   1      0   @user when a father is dysfunctional and is s...           21   
1   2      0  @user @user thanks for #lyft credit i can't us...           22   
2   3      0                                bihday your majesty            5   
3   4      0  #model   i love u take with u all the time in ...           17   
4   5      0             factsguide: society now    #motivation            8   

   char_count  avg_word_length  stopword_count  hashtags  numerics  uppercase  
0         102         4.555556              10         1         0          0  
1         122         5.315789               5         3         0          0  
2          21         5.666667               1         0         0          0  
3          86         4.928571               5         1         0          0  
4          39         8.000000               1         1         0          0   

      id                             

In [7]:
sum(train['numerics'] > 0), sum(train['uppercase'] > 0)

(2223, 75)

In [8]:
# Remove everything but leave english letters and spaces in lower case

import re
regexp = re.compile('[A-Za-z\']+')

train['tweet'] = train['tweet'].apply(lambda x: ' '.join(regexp.findall(x)).lower())
test['tweet'] = test['tweet'].apply(lambda x: ' '.join(regexp.findall(x)).lower())

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])



0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i can't use c...
2                                  bihday your majesty
3        model i love u take with u all the time in ur
4                    factsguide society now motivation
5    huge fan fare and big talking before they leav...
6    user camping tomorrow user user user user user...
7    the next school year is the year for exams can...
8    we won love the land allin cavs champions clev...
9                user user welcome here i'm it's so gr
Name: tweet, dtype: object 

0    studiolife aislife requires passion dedication...
1    user white supremacists want everyone to see t...
2    safe ways to heal your acne altwaystoheal heal...
3    is the hp and the cursed child book up for res...
4    rd bihday to my amazing hilarious nephew eli a...
5                                 choose to be momtips
6    something inside me dies eyes ness smokeyeyes ...
7       finished tattoo inked ink lo

In [9]:
# Error correction - no big sense here
#from textblob import TextBlob

#train['tweet'] = train['tweet'].apply(lambda x: str(TextBlob(x).correct()))
#test['tweet'] = test['tweet'].apply(lambda x: str(TextBlob(x).correct()))



In [10]:
# Removing stop words

train['tweet'] = train['tweet'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
test['tweet'] = test['tweet'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])



0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit can't use cause o...
2                                       bihday majesty
3                          model love u take u time ur
4                        factsguide society motivation
5    huge fan fare big talking leave chaos pay disp...
6    user camping tomorrow user user user user user...
7    next school year year exams can't think school...
8    love land allin cavs champions cleveland cleve...
9                             user user welcome i'm gr
Name: tweet, dtype: object 

0    studiolife aislife requires passion dedication...
1    user white supremacists want everyone see new ...
2    safe ways heal acne altwaystoheal healthy healing
3    hp cursed child book reservations already yes ...
4    rd bihday amazing hilarious nephew eli ahmir u...
5                                       choose momtips
6    something inside dies eyes ness smokeyeyes tir...
7       finished tattoo inked ink lo

In [11]:
# words frequencies
freq = pd.Series(" ".join(train['tweet']).split()).value_counts() 

# 10 most frequent words
freq_words = list(freq.index[:10])
print('Removed words: {}\n'.format(freq_words))

train['tweet'] = train['tweet'].apply(lambda x: ' '.join([x for x in x.split() if x not in freq_words]))
test['tweet'] = test['tweet'].apply(lambda x: ' '.join([x for x in x.split() if x not in freq_words]))

# remove most frequent words from frequency list 
freq = freq[10:]


print(train['tweet'][:10], '\n')
print(test['tweet'][:10])


Removed words: ['user', 'love', 'day', 'amp', 'happy', 'u', 'like', 'life', 'time', 'today']

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit can't use cause offer wheel...
2                                       bihday majesty
3                                        model take ur
4                        factsguide society motivation
5    huge fan fare big talking leave chaos pay disp...
6                               camping tomorrow danny
7    next school year year exams can't think school...
8    land allin cavs champions cleveland clevelandc...
9                                       welcome i'm gr
Name: tweet, dtype: object 

0    studiolife aislife requires passion dedication...
1    white supremacists want everyone see new birds...
2    safe ways heal acne altwaystoheal healthy healing
3    hp cursed child book reservations already yes ...
4    rd bihday amazing hilarious nephew eli ahmir u...
5                                       choose momti

In [12]:
# Stemming

from nltk.stem import PorterStemmer
st = PorterStemmer()

train['tweet'] = train['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
test['tweet'] = test['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])


0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit can't use caus offer wheelch...
2                                       bihday majesti
3                                        model take ur
4                              factsguid societi motiv
5    huge fan fare big talk leav chao pay disput ge...
6                                  camp tomorrow danni
7    next school year year exam can't think school ...
8    land allin cav champion cleveland clevelandcavali
9                                        welcom i'm gr
Name: tweet, dtype: object 

0    studiolif aislif requir passion dedic willpow ...
1     white supremacist want everyon see new bird movi
2            safe way heal acn altwaystoh healthi heal
3    hp curs child book reserv alreadi ye harrypott...
4    rd bihday amaz hilari nephew eli ahmir uncl da...
5                                         choos momtip
6    someth insid die eye ness smokeyey tire lone s...
7             finish tattoo ink ink 

In [13]:
# TextBlob sentiment feature 

from textblob import TextBlob

train['TextBlob_sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0])
test['TextBlob_sentiment'] = test['tweet'].apply(lambda x: TextBlob(x).sentiment[0])

print(train[:5], '\n')
print(test[:5])


   id  label                                              tweet  words_count  \
0   1      0      father dysfunct selfish drag kid dysfunct run           21   
1   2      0  thank lyft credit can't use caus offer wheelch...           22   
2   3      0                                     bihday majesti            5   
3   4      0                                      model take ur           17   
4   5      0                            factsguid societi motiv            8   

   char_count  avg_word_length  stopword_count  hashtags  numerics  uppercase  \
0         102         4.555556              10         1         0          0   
1         122         5.315789               5         3         0          0   
2          21         5.666667               1         0         0          0   
3          86         4.928571               5         1         0          0   
4          39         8.000000               1         1         0          0   

   TextBlob_sentiment  
0       

In [14]:
# Split training data to training and validation subsets

from sklearn.model_selection import train_test_split

X = train[[col for col in train.columns if (col != 'id') & (col != 'label')]]
y = train['label'].values.astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((21414, 9), (21414,), (10548, 9), (10548,))

In [15]:
# TFIDF with N-grams

from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(ngram_range=(1,1), lowercase=False)

X_train_txt_features = vect.fit_transform(X_train['tweet'])
X_test_txt_features = vect.transform(X_test['tweet'])

X_train_txt_features.shape, X_test_txt_features.shape


((21414, 24532), (10548, 24532))

## Modeling

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.1, class_weight='balanced')
model.fit(X_train_txt_features, y_train)

model_sentiment = model.predict(X_train_txt_features)

print('Score for training set: {}'.format(model.score(X_train_txt_features, y_train).round(3)))
print('Score for validation set: {}'.format(model.score(X_test_txt_features, y_test).round(3)))
     

Score for training set: 0.947
Score for validation set: 0.927




In [17]:
# retrain the model on all training data

X_train = X
y_train = y
X_test = test[[col for col in test.columns if (col != 'id')]]

X_train_txt_features = vect.fit_transform(X_train['tweet'])
X_test_txt_features = vect.transform(X_test['tweet'])

print('Data shape:', X_train_txt_features.shape, X_test_txt_features.shape)

model.fit(X_train_txt_features, y_train)
print('Score for training set: {}'.format(model.score(X_train_txt_features, y_train).round(3)))

train_sentiment = model.predict(X_train_txt_features)
test_sentiment = model.predict(X_test_txt_features)


Data shape: (31962, 31497) (17197, 31497)
Score for training set: 0.942




In [18]:
# Stack sentiment predictions with other features
X_train = X_train.drop('tweet', axis='columns')
X_test = X_test.drop('tweet', axis='columns')

X_train['model_sentiment'] = train_sentiment
X_test['model_sentiment'] = test_sentiment

X_train.head()

Unnamed: 0,words_count,char_count,avg_word_length,stopword_count,hashtags,numerics,uppercase,TextBlob_sentiment,model_sentiment
0,21,102,4.555556,10,1,0,0,-0.3,0
1,22,122,5.315789,5,3,0,0,0.0,0
2,5,21,5.666667,1,0,0,0,0.0,0
3,17,86,4.928571,5,1,0,0,0.0,0
4,8,39,8.0,1,1,0,0,0.0,0


In [19]:
# Stack model
stack_model = LogisticRegression(C=0.1, class_weight='balanced')

stack_model.fit(X_train, y_train)
print('Score for training set: {}'.format(stack_model.score(X_train, y_train).round(3)))

prediction = stack_model.predict(X_test)


Score for training set: 0.942


In [20]:
type(prediction)

numpy.ndarray

## Submission

In [21]:
df_submission['label'] = prediction.astype(int)
print(df_submission.head())

df_submission.to_csv('./Documents/Competitions/Twitter Sentiment/submission_1.csv', index=False)



      id  label
0  31963      0
1  31964      1
2  31965      0
3  31966      0
4  31967      0


In [24]:
# rank at submission
rank = 408
participants = 10321
print('Acheaved rank is {} from {} participants, or top {} %'.format(
            rank, participants, round(100*rank/participants, 2)))

Acheaved rank is 408 from 10321 participants, or top 3.95 %
