In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import os
print(os.listdir("input"))

['test.tsv', 'train.tsv']


In [3]:
# Loading Data
train_df = pd.read_csv("input/train.tsv",sep='\t')
test_df = pd.read_csv("input/test.tsv",sep='\t')

In [4]:
#Training Data Set
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla..."


In [5]:
#Testing Data Set
test_df.head()
print('Testing data set has no Label column')
print(test_df.head(10))

Testing data set has no Label column
             tweet_id                                         tweet_text
0  264238274963451904  @jjuueellzz down in the Atlantic city, ventnor...
1  218775148495515649  Musical awareness: Great Big Beautiful Tomorro...
2  258965201766998017  On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3  262926411352903682  Kapan sih lo ngebuktiin,jan ngomong doang Susa...
4  171874368908050432  Excuse the connectivity of this live stream, f...
5  256010056942903296  Show your LOVE for your local field & it might...
6  253809989599232000  Milton on Bolton Wanderers 2 v 2 Leeds United,...
7  261776619146985472  @firecore Can you tell me when an update for t...
8  264143999374356481  @Heavensbasement The Crown, Filthy McNastys, K...
9  223052929131757571  Uncover the Eternal City! Return flights to Ro...


In [6]:
# Training Data Set Information
print("Training Data Set Info - Total Rows | Total Columns | Total Null Values")
print(train_df.info())

Training Data Set Info - Total Rows | Total Columns | Total Null Values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21630 entries, 0 to 21629
Data columns (total 3 columns):
tweet_id      21630 non-null int64
sentiment     21630 non-null object
tweet_text    21630 non-null object
dtypes: int64(1), object(2)
memory usage: 507.1+ KB
None


In [7]:
# Testing Data Set Information
print("Test Data Set Info - Total Rows | Total Columns | Total Null Values")
print(test_df.info())

Test Data Set Info - Total Rows | Total Columns | Total Null Values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 2 columns):
tweet_id      5398 non-null int64
tweet_text    5398 non-null object
dtypes: int64(1), object(1)
memory usage: 84.5+ KB
None


In [8]:
# Merging both the data sets as tweets in both the data set is unstructured
combine_df = train_df.append(test_df, ignore_index = True, sort = False)
combine_df.head()

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti..."


In [9]:
# Combine (Merged) Data Set Information
print("Combine Data Set Info - Total Rows | Total Columns | Total Null Values")
print(combine_df.info())

Combine Data Set Info - Total Rows | Total Columns | Total Null Values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27028 entries, 0 to 27027
Data columns (total 3 columns):
tweet_id      27028 non-null int64
sentiment     21630 non-null object
tweet_text    27028 non-null object
dtypes: int64(1), object(2)
memory usage: 633.6+ KB
None


In [10]:
# Importing HTMLParser
from html.parser import HTMLParser
html_parser = HTMLParser()

In [11]:
# Created a new columns i.e. clean_tweet contains the same tweets but cleaned version
combine_df['clean_tweet'] = combine_df['tweet_text'].apply(lambda x: html_parser.unescape(x))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,Iranian general says Israel's Iron Dome can't ...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","Tehran, Mon Amour: Obama Tried to Establish Ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,I sat through this whole movie just for Harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","Talking about ACT's && SAT's, deciding where I..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,Why is Happy Valentines Day trending? It's o...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","They may have a SuperBowl in Dallas, but Dalla..."


In [13]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [21]:
# remove twitter handles (@user)
combine_df['clean_tweet'] = np.vectorize(remove_pattern)(combine_df['clean_tweet'], "@[\w]*")
combine_df.head(100)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,Iranian general says Israel's Iron Dome can't ...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","Tehran, Mon Amour: Obama Tried to Establish Ti..."
...,...,...,...,...
95,263132960507703296,neutral,and the 4th one is for Harry Styles!! <33333,and the 4th one is for Harry Styles!! <33333
96,264229531773177856,neutral,"this guy so much like bruno mars, it's crazy. ...","this guy so much like bruno mars, it's crazy. ..."
97,259022549604790273,neutral,Blog Post: MTV's Teen Mom 2 Returns for an I...,Blog Post: MTV's Teen Mom 2 Returns for an I...
98,253050608322502657,neutral,The Business Leader's Award ceremony will be h...,The Business Leader's Award ceremony will be h...


In [22]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: x.lower())
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit $3.39!!!! i'm going to cha...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","theo walcott is still shit, watch rafa and joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that i'm a gsp fan, i just hate nick d..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel's iron dome can't ...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","tehran, mon amour: obama tried to establish ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th. main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","talking about act's && sat's, deciding where i..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending? it's o...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","they may have a superbowl in dallas, but dalla..."


In [23]:
# Apostrophe Dictionary
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}
apostrophe_dict

{"ain't": 'am not / are not',
 "aren't": 'are not / am not',
 "can't": 'cannot',
 "can't've": 'cannot have',
 "'cause": 'because',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he had / he would',
 "he'd've": 'he would have',
 "he'll": 'he shall / he will',
 "he'll've": 'he shall have / he will have',
 "he's": 'he has / he is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how has / how is',
 "i'd": 'I had / I would',
 "i'd've": 'I would have',
 "i'll": 'I shall / I will',
 "i'll've": 'I shall have / I will have',
 "i'm": 'I am',
 "i've": 'I have',
 "isn't": 'is not',
 "it'd": 'it had / it would',
 "it'd've": 'it would have',
 "it'll": 'it shall / it will',
 "it'll've": 'it shall have / it will have',
 "it's": 'it has / it is',
 "let's": 'l

In [24]:
def lookup_dict(text, dictionary):
    for word in text.split():
        if word.lower() in dictionary:
            if word.lower() in text.split():
                text = text.replace(word, dictionary[word.lower()])
    return text

In [25]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: lookup_dict(x,apostrophe_dict))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit $3.39!!!! I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","theo walcott is still shit, watch rafa and joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I am a gsp fan, i just hate nick ..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel's iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","tehran, mon amour: obama tried to establish ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th. main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","talking about act's && sat's, deciding where i..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending? it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","they may have a superbowl in dallas, but dalla..."


In [26]:
def remove_site(text):
    URLless_string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    return URLless_string

In [29]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: remove_site(x))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit $3.39!!!! I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","theo walcott is still shit, watch rafa and joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I am a gsp fan, i just hate nick ..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel's iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","tehran, mon amour: obama tried to establish ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th. main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","talking about act's && sat's, deciding where i..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending? it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","they may have a superbowl in dallas, but dalla..."


In [30]:
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [31]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: lookup_dict(x,short_word_dict))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit $3.39!!!! I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","theo walcott is still shit, watch rafa and joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I am a gsp fan, i just hate nick ..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel's iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","tehran, mon amour: obama tried to establish ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th. main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","talking about act's && sat's, deciding where i..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending? it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","they may have a superbowl in dallas, but dalla..."


In [32]:
emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}
emoticon_dict

{':)': 'happy',
 ':‑)': 'happy',
 ':-]': 'happy',
 ':-3': 'happy',
 ':->': 'happy',
 '8-)': 'happy',
 ':-}': 'happy',
 ':o)': 'happy',
 ':c)': 'happy',
 ':^)': 'happy',
 '=]': 'happy',
 '=)': 'happy',
 '<3': 'happy',
 ':-(': 'sad',
 ':(': 'sad',
 ':c': 'sad',
 ':<': 'sad',
 ':[': 'sad',
 '>:[': 'sad',
 ':{': 'sad',
 '>:(': 'sad',
 ':-c': 'sad',
 ':-< ': 'sad',
 ':-[': 'sad',
 ':-||': 'sad'}

In [33]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: lookup_dict(x,emoticon_dict))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit $3.39!!!! I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...","theo walcott is still shit, watch rafa and joh..."
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...","its not that I am a gsp fan, i just hate nick ..."
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel's iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...","tehran, mon amour: obama tried to establish ti..."
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th. main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...","talking about act's && sat's, deciding where i..."
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending? it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...","they may have a superbowl in dallas, but dalla..."


In [34]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit 3 39 I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...",theo walcott is still shit watch rafa and joh...
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...",its not that I am a gsp fan i just hate nick ...
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel s iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...",tehran mon amour obama tried to establish ti...
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...",talking about act s sat s deciding where i...
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...",they may have a superbowl in dallas but dalla...


In [35]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit 3 39 I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...",theo walcott is still shit watch rafa and joh...
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...",its not that I am a gsp fan i just hate nick ...
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel s iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...",tehran mon amour obama tried to establish ti...
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar 11th main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...",talking about act s sat s deciding where i...
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...",they may have a superbowl in dallas but dalla...


In [36]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit I am going to ch...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...",theo walcott is still shit watch rafa and joh...
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...",its not that I am a gsp fan i just hate nick ...
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel s iron dome cannot...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...",tehran mon amour obama tried to establish ti...
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,i sat through this whole movie just for harry ...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with j davlar th main rivals are team polan...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...",talking about act s sat s deciding where i...
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending it has...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...",they may have a superbowl in dallas but dalla...


In [37]:
combine_df['clean_tweet'] = combine_df['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
combine_df.head(10)

Unnamed: 0,tweet_id,sentiment,tweet_text,clean_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I'm going to Cha...,gas by my house hit am going to chapel hill on...
1,263405084770172928,negative,"Theo Walcott is still shit, watch Rafa and Joh...",theo walcott is still shit watch rafa and john...
2,262163168678248449,negative,"its not that I'm a GSP fan, i just hate Nick D...",its not that am gsp fan just hate nick diaz ca...
3,264249301910310912,negative,Iranian general says Israel's Iron Dome can't ...,iranian general says israel iron dome cannot d...
4,262682041215234048,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti...",tehran mon amour obama tried to establish ties...
5,264229576773861376,neutral,I sat through this whole movie just for Harry ...,sat through this whole movie just for harry an...
6,264105751826538497,positive,with J Davlar 11th. Main rivals are team Polan...,with davlar th main rivals are team poland hop...
7,264094586689953794,negative,"Talking about ACT's && SAT's, deciding where I...",talking about act sat deciding where want to g...
8,212392538055778304,neutral,Why is Happy Valentines Day trending? It's o...,why is happy valentines day trending it has it...
9,254941790757601280,negative,"They may have a SuperBowl in Dallas, but Dalla...",they may have superbowl in dallas but dallas a...


In [38]:
arr = combine_df[:21630]


In [39]:
arr.to_csv('converted.csv')

In [45]:
arr2 = combine_df[21630:]
arr2.to_csv('converted_test.csv')

In [57]:
features = arr.iloc[:,3].values
print(features)

['gas by my house hit am going to chapel hill on sat happy'
 'theo walcott is still shit watch rafa and johnny deal with him on saturday'
 'its not that am gsp fan just hate nick diaz cannot wait for february'
 ...
 'luca di montezemolo who last day was monday on why alonso is leaving ferrari'
 'coffee is pretty much the answer to all questions today friday tgif'
 'niki lauda justnfirmed to sky that alonso was released ofntact on thursday night']


In [63]:
labels = arr.iloc[:,1].values
print(lables)

['positive' 'negative' 'negative' ... 'neutral' 'positive' 'neutral']


In [61]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(features).toarray()

In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

In [142]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(processed_features, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [89]:
X_test.shape

(4326, 2500)

In [66]:
predictions = text_classifier.predict(X_test)

In [67]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[ 163  375  163]
 [  69 1313  381]
 [  45  577 1240]]
              precision    recall  f1-score   support

    negative       0.59      0.23      0.33       701
     neutral       0.58      0.74      0.65      1763
    positive       0.70      0.67      0.68      1862

    accuracy                           0.63      4326
   macro avg       0.62      0.55      0.56      4326
weighted avg       0.63      0.63      0.61      4326

0.627831715210356


In [74]:
test_features = arr2.iloc[:,3].values
print(test_features)

['down in the atlantic city ventnor margate ocean city area am just waiting for the coordinator to hopefully call me tomorrow'
 'musical awareness great big beautiful tomorrow has an ending now is the time does not'
 'on radio fm fri oct labour analyst shawn hattingh cosatu role in thentext of unrest in the mining'
 ...
 'for the st time in years for your splendiferous entertainment arts in lechlade proudly presents an old'
 'nurses day may nursing the heart beat of the health'
 'we have minutes left until the nd episode of styled to rock uknavi rihannanavy']


In [90]:
processed_test_features = vectorizer.fit_transform(test_features).toarray()
processed_test_features.shape

(5398, 1699)

In [91]:
list1 = np.zeros((5398, 801))

In [92]:
processed_test_features = np.concatenate((processed_test_features, list1), axis = 1)
processed_test_features.shape

(5398, 2500)

In [143]:
predictions = text_classifier.predict(processed_test_features)

In [125]:
output = pd.DataFrame(predictions,columns = ['sentiment'])

In [129]:
output.to_csv('ans.csv')

In [130]:
test_df.to_csv('ans_label.csv')

In [131]:
test_df

Unnamed: 0,tweet_id
0,264238274963451904
1,218775148495515649
2,258965201766998017
3,262926411352903682
4,171874368908050432
...,...
5393,210378118865756160
5394,245177521304399872
5395,259280987089932288
5396,201113950211940352


In [144]:
test_df['sentiment'] = predictions

In [145]:
test_df

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,neutral
2,258965201766998017,positive
3,262926411352903682,neutral
4,171874368908050432,negative
...,...,...
5393,210378118865756160,positive
5394,245177521304399872,neutral
5395,259280987089932288,negative
5396,201113950211940352,neutral


In [146]:
test_list = []
heading = ['tweet_id', 'sentiment']
test_list.append(heading)
for i in range(len(test_df['tweet_id'])):
    sub = []
    sub.append(test_df['tweet_id'][i])
    sub.append(test_df['sentiment'][i])
    test_list.append(sub)

In [147]:
test_list

[['tweet_id', 'sentiment'],
 [264238274963451904, 'neutral'],
 [218775148495515649, 'neutral'],
 [258965201766998017, 'positive'],
 [262926411352903682, 'neutral'],
 [171874368908050432, 'negative'],
 [256010056942903296, 'positive'],
 [253809989599232000, 'positive'],
 [261776619146985472, 'neutral'],
 [264143999374356481, 'neutral'],
 [223052929131757571, 'positive'],
 [264088575476391936, 'neutral'],
 [264030422244147200, 'neutral'],
 [263720820428394496, 'neutral'],
 [258639996037259264, 'neutral'],
 [250981027969896448, 'positive'],
 [195554987919679488, 'neutral'],
 [264225147542716416, 'neutral'],
 [260536025800142848, 'positive'],
 [192349690921103360, 'neutral'],
 [263473556753620993, 'positive'],
 [263182075434254336, 'positive'],
 [262434763493748736, 'neutral'],
 [256514267149385729, 'neutral'],
 [263733718085140481, 'negative'],
 [241403423130525699, 'neutral'],
 [263690432473624578, 'neutral'],
 [260072342905049088, 'neutral'],
 [259780914509389824, 'neutral'],
 [26420868

In [148]:
len(test_list)

5399

In [149]:
import csv
with open('test_result2.csv', 'w', newline='') as fp:
    a = csv.writer(fp, delimiter = ",")
    data = test_list
    a.writerows(data)

In [150]:
check = pd.read_csv("test_result2.csv")

In [151]:
check.head()

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,neutral
2,258965201766998017,positive
3,262926411352903682,neutral
4,171874368908050432,negative
