# Identify the Sentiments  
https://datahack.analyticsvidhya.com/contest/linguipedia-codefest-natural-language-processing-1/  
**Aleksey Shipitsyn**    
**2019-08-02**

## Read and explore data

In [3]:
import numpy as np
import pandas as pd


In [4]:
# read train data
df_train = pd.read_csv('./Documents/Competitions/Identify Sentiments/train_2kmZucJ.xls') 

print('Missing values:')
print(df_train.isnull().sum(),'\n')
print('shape:', df_train.shape, '\n')

df_train.head()



Missing values:
id       0
label    0
tweet    0
dtype: int64 

shape: (7920, 3) 



Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [5]:
# check the balance of target sentiment 
pd.crosstab(index=df_train.label, columns=['label'], normalize=True).round(2)



col_0,label
label,Unnamed: 1_level_1
0,0.74
1,0.26


In [6]:
# read test data
df_test = pd.read_csv('./Documents/Competitions/Identify Sentiments/test_oJQbWVk.xls')

print('Missing values:')
print(df_test.isnull().sum(),'\n')
print('shape:', df_test.shape, '\n')

df_test.head()


Missing values:
id       0
tweet    0
dtype: int64 

shape: (1953, 2) 



Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [7]:
# read submission data
df_submission = pd.read_csv('./Documents/Competitions/Identify Sentiments/sample_submission_LnhVWA4.xls')

# check if submission hash corresponds to test set
print('Submission file matches test set file:', all(df_test.id == df_submission.id))

df_submission.head()


Submission file matches test set file: True


Unnamed: 0,id,label
0,7921,0
1,7922,0
2,7923,0
3,7924,0
4,7925,0


## Data preprocessing  

### Text preprocessing  

- Add more features 
- Remove everything except of english letters
- Stopwords  
- Stemming 
- N-grams  
- TFIDF  


In [8]:
# Add more features

train = df_train
test = df_test


# number of words
train['words_count'] = train['tweet'].apply(lambda x: len(str(x).split(' ')))
test['words_count'] = test['tweet'].apply(lambda x: len(str(x).split(' ')))


# number of characters
train['char_count'] = train['tweet'].str.len()
test['char_count'] = test['tweet'].str.len()


# average word length
def avg_word_length(sentence):
    words = sentence.split()
    return sum( len(word) for word in words) / len(words) 

train['avg_word_length'] = train['tweet'].apply(lambda x: avg_word_length(x))
test['avg_word_length'] = test['tweet'].apply(lambda x: avg_word_length(x))


# number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')

train['stopword_count'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
test['stopword_count'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))


# number of special characters
train['hashtags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
test['hashtags'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))


# number of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
test['numerics'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


# number od uppercase words
train['uppercase'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
test['uppercase'] = test['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

print(train[:5], '\n')
print(test[:5])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alekseyshipitsyn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   id  label                                              tweet  words_count  \
0   1      0  #fingerprint #Pregnancy Test https://goo.gl/h1...           13   
1   2      0  Finally a transparant silicon case ^^ Thanks t...           17   
2   3      0  We love this! Would you go? #talk #makememorie...           15   
3   4      0  I'm wired I know I'm George I was made that wa...           17   
4   5      1  What amazing service! Apple won't even talk to...           23   

   char_count  avg_word_length  stopword_count  hashtags  numerics  uppercase  
0         128         8.923077               0        11         0          0  
1         131         6.764706               3         5         0          1  
2         123         7.266667               1         8         0          0  
3         112         5.647059               2         4         0          2  
4         124         4.434783               9         0         0          2   

     id                              

In [9]:
sum(train['numerics'] > 0), sum(train['uppercase'] > 0)

(837, 3146)

In [10]:
# Remove everything but leave english letters and spaces in lower case

import re
regexp = re.compile('[A-Za-z\']+')

train['tweet'] = train['tweet'].apply(lambda x: ' '.join(regexp.findall(x)).lower())
test['tweet'] = test['tweet'].apply(lambda x: ' '.join(regexp.findall(x)).lower())

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])



0    fingerprint pregnancy test https goo gl h mfqv...
1    finally a transparant silicon case thanks to m...
2    we love this would you go talk makememories un...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service apple won't even talk to ...
5    iphone software update fucked up my phone big ...
6    happy for us instapic instadaily us sony xperi...
7    new type c charger cable uk http www ebay co u...
8    bout to go shopping again listening to music i...
9    photo fun selfie pool water sony camera picoft...
Name: tweet, dtype: object 

0    i hate the new iphone upgrade won't let me dow...
1    currently shitting my fucking pants apple imac...
2    i'd like to puts some cd roms on my ipad is th...
3    my ipod is officially dead i lost all my pictu...
4    been fighting itunes all night i only want the...
5    repost getbakednfried with repostapp to announ...
6    this new apple software update is really doing...
7    baby iphone iphone s gold new a

In [9]:
# Error correction - no big sense here

#from textblob import TextBlob
#train['tweet'] = train['tweet'].apply(lambda x: str(TextBlob(x).correct()))
#test['tweet'] = test['tweet'].apply(lambda x: str(TextBlob(x).correct()))



In [11]:
# Removing stop words

train['tweet'] = train['tweet'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
test['tweet'] = test['tweet'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])



0    fingerprint pregnancy test https goo gl h mfqv...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    i'm wired know i'm george made way iphone cute...
4    amazing service apple even talk question unles...
5    iphone software update fucked phone big time s...
6    happy us instapic instadaily us sony xperia xp...
7    new type c charger cable uk http www ebay co u...
8    bout go shopping listening music iphone justme...
9    photo fun selfie pool water sony camera picoft...
Name: tweet, dtype: object 

0    hate new iphone upgrade let download apps ugh ...
1    currently shitting fucking pants apple imac ca...
2    i'd like puts cd roms ipad possible ' yes bloc...
3    ipod officially dead lost pictures videos sos ...
4                fighting itunes night want music paid
5    repost getbakednfried repostapp announce apple...
6    new apple software update really things phone ...
7    baby iphone iphone gold new app

In [12]:
# words frequencies
freq = pd.Series(" ".join(train['tweet']).split()).value_counts() 

# 10 most frequent words
freq_words = list(freq.index[:10])
print('Removed words: {}\n'.format(freq_words))

train['tweet'] = train['tweet'].apply(lambda x: ' '.join([x for x in x.split() if x not in freq_words]))
test['tweet'] = test['tweet'].apply(lambda x: ' '.join([x for x in x.split() if x not in freq_words]))

# remove most frequent words from frequency list 
freq = freq[10:]


print(train['tweet'][:10], '\n')
print(test['tweet'][:10])


Removed words: ['iphone', 'http', 'com', 'apple', 'p', 'instagram', 'samsung', 'twitter', 'new', 'https']

0    fingerprint pregnancy test goo gl h mfqv andro...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax s...
3    i'm wired know i'm george made way cute davent...
4    amazing service even talk question unless pay ...
5    software update fucked phone big time stupid i...
6    happy us instapic instadaily us sony xperia xp...
7    type c charger cable uk www ebay co uk itm bay...
8    bout go shopping listening music justme music ...
9    photo fun selfie pool water sony camera picoft...
Name: tweet, dtype: object 

0             hate upgrade let download apps ugh sucks
1    currently shitting fucking pants imac cashmone...
2    i'd like puts cd roms ipad possible ' yes bloc...
3    ipod officially dead lost pictures videos sos ...
4                fighting itunes night want music paid
5    repost getbakednfried repostapp an

In [13]:
# Stemming

from nltk.stem import PorterStemmer
st = PorterStemmer()

train['tweet'] = train['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
test['tweet'] = test['tweet'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

print(train['tweet'][:10], '\n')
print(test['tweet'][:10])


0    fingerprint pregnanc test goo gl h mfqv androi...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax sma...
3    i'm wire know i'm georg made way cute daventri...
4    amaz servic even talk question unless pay stup...
5       softwar updat fuck phone big time stupid iphon
6    happi us instap instadaili us soni xperia xper...
7    type c charger cabl uk www ebay co uk itm bay ...
8    bout go shop listen music justm music likeforl...
9    photo fun selfi pool water soni camera picofth...
Name: tweet, dtype: object 

0                hate upgrad let download app ugh suck
1    current shit fuck pant imac cashmoney raddest ...
2    i'd like put cd rom ipad possibl ' ye block sc...
3    ipod offici dead lost pictur video so concert ...
4                     fight itun night want music paid
5    repost getbakednfri repostapp announc bourbon ...
6           softwar updat realli thing phone bad thing
7    babi gold appleisbest gb geg nn

In [14]:
# TextBlob sentiment feature 

from textblob import TextBlob

train['TextBlob_sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0])
test['TextBlob_sentiment'] = test['tweet'].apply(lambda x: TextBlob(x).sentiment[0])

print(train[:5], '\n')
print(test[:5])


   id  label                                              tweet  words_count  \
0   1      0  fingerprint pregnanc test goo gl h mfqv androi...           13   
1   2      0  final transpar silicon case thank uncl yay son...           17   
2   3      0  love would go talk makememori unplug relax sma...           15   
3   4      0  i'm wire know i'm georg made way cute daventri...           17   
4   5      1  amaz servic even talk question unless pay stup...           23   

   char_count  avg_word_length  stopword_count  hashtags  numerics  uppercase  \
0         128         8.923077               0        11         0          0   
1         131         6.764706               3         5         0          1   
2         123         7.266667               1         8         0          0   
3         112         5.647059               2         4         0          2   
4         124         4.434783               9         0         0          2   

   TextBlob_sentiment  
0       

In [34]:
# Split training data to training and validation subsets

from sklearn.model_selection import train_test_split

X = train[[col for col in train.columns if (col != 'id') & (col != 'label')]]
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape


((5306, 9), (5306,), (2614, 9), (2614,))

In [42]:
# TFIDF with N-grams

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(ngram_range=(1,3), lowercase=False)

X_train_txt_features = vect.fit_transform(X_train['tweet'])
X_test_txt_features = vect.transform(X_test['tweet'])

X_train_txt_features.shape, X_test_txt_features.shape


((5306, 105665), (2614, 105665))

## Modeling

In [69]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1, class_weight='balanced')

model.fit(X_train_txt_features, y_train)

model_sentiment = model.predict(X_train_txt_features)

    

In [70]:
# F1 score

from sklearn.metrics import f1_score

f1_train = f1_score(y_true=y_train, y_pred=model.predict(X_train_txt_features))
f1_validation = f1_score(y_true=y_test, y_pred=model.predict(X_test_txt_features))


print('Score for training set: {}'.format(f1_train))
print('Score for validation set: {}'.format(f1_validation))
 

Score for training set: 0.933048433048433
Score for validation set: 0.8028442146089204


In [71]:
# retrain the model on all training data

X_train = X
y_train = y
X_test = test[[col for col in test.columns if (col != 'id')]]

X_train_txt_features = vect.fit_transform(X_train['tweet'])
X_test_txt_features = vect.transform(X_test['tweet'])

print('Data shape:', X_train_txt_features.shape, X_test_txt_features.shape)

model.fit(X_train_txt_features, y_train)

f1_train = f1_score(y_true=y_train, y_pred=model.predict(X_train_txt_features))
print('Score for training set: {}'.format(f1_train))

train_sentiment = model.predict(X_train_txt_features)
test_sentiment = model.predict(X_test_txt_features)


Data shape: (7920, 150631) (1953, 150631)
Score for training set: 0.9249193919852602




In [72]:
# Stack sentiment predictions with other features
X_train = X_train.drop('tweet', axis='columns')
X_test = X_test.drop('tweet', axis='columns')

X_train['model_sentiment'] = train_sentiment
X_test['model_sentiment'] = test_sentiment

X_train.head()

Unnamed: 0,words_count,char_count,avg_word_length,stopword_count,hashtags,numerics,uppercase,TextBlob_sentiment,model_sentiment
0,13,128,8.923077,0,11,0,0,0.5,0
1,17,131,6.764706,3,5,0,1,0.0,0
2,15,123,7.266667,1,8,0,0,0.5,0
3,17,112,5.647059,2,4,0,2,0.5,0
4,23,124,4.434783,9,0,0,2,-0.8,1


In [74]:
# Stack model
stack_model = LogisticRegression(C=1, class_weight='balanced')

stack_model.fit(X_train, y_train)

f1_stack = f1_score(y_true=y_train, y_pred=stack_model.predict(X_train))
print('Score for training set: {}'.format(f1_stack))

prediction = stack_model.predict(X_test)


Score for training set: 0.9249193919852602




## Submission

In [75]:
df_submission['label'] = prediction.astype(int)
print(df_submission.head())

df_submission.to_csv('./Documents/Competitions/Identify Sentiments/submission_1.csv', index=False)



     id  label
0  7921      1
1  7922      1
2  7923      1
3  7924      1
4  7925      1


In [76]:
# rank at submission
rank = 225
participants = 2424
print('Acheaved rank is {} from {} participants, or top {} %'.format(
            rank, participants, round(100*rank/participants, 2)))

Acheaved rank is 225 from 2424 participants, or top 9.28 %
