In [45]:
#Import libraries
import spacy
import gensim
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore")
import pyLDAvis
import pyLDAvis.gensim
from afinn import Afinn
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [27]:
#Load the training tweets from train.csv and preview the first rows
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [28]:
#Drop rows whose tweet is missing *before* casting to str: astype('str') turns
#NaN into the literal text 'nan', which a later dropna() can no longer detect
df = df.dropna(subset=['tweet'])

#Convert to string
df['tweet'] = df['tweet'].astype('str')

#Change encoding of df['tweet']: encoding to ASCII with errors ignored discards
#every non-ASCII character, decoding turns the bytes back into a str
df['tweet'] = df['tweet'].apply(lambda x: x.encode('ascii', 'ignore').decode('utf-8'))

In [29]:
#Drop null values: remove every row that has a missing value in any column
df = df.dropna()

In [30]:
#Lower-case every tweet using the vectorised string accessor
df['tweet'] = df['tweet'].str.lower()

In [31]:
#Materialise the tweet column as a plain Python list
tweets = list(df['tweet'])

In [32]:
#Extract the hashtags of each tweet (the word characters after '#'), one list per tweet
hashtags = [re.findall(r"#(\w+)", tweet) for tweet in tweets]

#Flatten the per-tweet lists into one lower-cased list of hashtags
flat_list = [tag.lower() for tags in hashtags for tag in tags]

In [33]:
#Get unique list of hashtags, keeping first-seen order.
#dict keys are unique and insertion-ordered (Python 3.7+), so this is a single
#pass instead of scanning the growing result list for every hashtag.
hashtags_unique = list(dict.fromkeys(flat_list))

In [34]:
#Initialize an AFINN lexicon scorer (used below to score each unique hashtag)
afinn = Afinn()

In [35]:
#Score every unique hashtag with AFINN, in the same order as hashtags_unique
afinn_scores = list(map(afinn.score, hashtags_unique))

In [36]:
#Build a frame with the scores in column 0 and the matching hashtag beside each score
afinn_df = pd.DataFrame(afinn_scores).assign(hashtags=hashtags_unique)

In [37]:
#Unimportant hashtags: keep only the rows whose AFINN score is exactly 0
has_zero_score = afinn_df[0].eq(0)
afinn_df = afinn_df.loc[has_zero_score]

In [38]:
#Hashtags to include as stopwords (the zero-score hashtags kept above)
#NOTE(review): hashtag_stop is not referenced again in the cells shown -- confirm
#whether it was meant to be merged into stop_words
hashtag_stop = list(afinn_df['hashtags'])

In [39]:
# function to remove every match of a regex (e.g. user handles) from a string
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to remove.

    Returns
    -------
    str
        ``input_txt`` without the matched spans; unchanged if nothing matches.
    """
    # Substitute on the pattern itself. Re-using each matched *text* as a new
    # regex (the previous approach) misreads metacharacters inside the match
    # and leaves residue when one match is a prefix of another
    # ("@ab @abc" used to become " c").
    return re.sub(pattern, '', input_txt)

In [40]:
#Remove user handles ('@' followed by any word characters).
#The pattern is a raw string so the regex escape \w is not treated as an
#(invalid) Python string escape.
df['clean_tweets'] = df['tweet'].apply(lambda row: remove_pattern(row, r"@[\w]*"))

  df['clean_tweets'] = df['tweet'].apply(lambda row: remove_pattern(row, "@[\w]*"))


In [41]:
#Make list of stopwords: every punctuation character plus tweet-specific noise
#tokens. '.', '_' and '/' are already part of `punctuation`, and each extra
#token is listed once; dict.fromkeys() guarantees the result has no duplicates
#while keeping first-seen order, and stop_words stays a plain list.
extra_stop_words = ["{link}", "link", "amp", "'s", "rt"]
stop_words = list(dict.fromkeys(list(punctuation) + extra_stop_words))

In [42]:
#Remove words with 3 or fewer characters (currently disabled)
#df['clean_tweets'] = df['clean_tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

In [43]:
# split each cleaned tweet into a list of tokens with NLTK's word tokenizer
df['tokenized_text'] = df['clean_tweets'].apply(nltk.word_tokenize)

In [44]:
# stopword removal: keep only the tokens that are not listed in stop_words
def _drop_stop_words(tokens):
    """Return the tokens of one tweet with every stop word filtered out."""
    return [token for token in tokens if token not in stop_words]

df['tokenized_text'] = df['tokenized_text'].apply(_drop_stop_words)

In [46]:
#Initialize a lemmatizer
#NOTE(review): `lemma` is not used in the cells shown (the tokens are stemmed
#with `stemma` instead) -- confirm whether it is still needed
lemma = WordNetLemmatizer()

In [47]:
#Initialize the Porter stemmer that is applied to every token below
stemma = PorterStemmer()

In [48]:
#Stem every token of every tweet with the Porter stemmer
df['tokenized_text'] = df['tokenized_text'].apply(lambda tokens: list(map(stemma.stem, tokens)))

In [49]:
#Join each token list back into a single space-separated string
df['tokenized_text'] = df['tokenized_text'].apply(' '.join)

In [50]:
#Inspect the cleaned, stemmed text column
df['tokenized_text']

0       sxswnui sxsw appl defin languag of touch with ...
1       learn ab googl doodl all doodl should be light...
2       one of the most in-your-fac ex of steal the sh...
3       thi iphon sxsw app would b pretti awesom if it...
4       line outsid the appl store in austin wait for ...
5       technew one lone dude await ipad 2 at appl sxs...
6       sxsw tip princ npr video toy shop with zuckerb...
7       nu user new ubersoci for iphon now in the app ...
8                       free sxsw sampler on itun freemus
9       i think i might go all weekend without see the...
10      offici sxsw app sxsw go bit.ly/hmiiga android ...
11                   it offici i 'm buy an ipad sxsw elev
12      they 're give away ipad 2 x box and book at sx...
13      we 're offici at sxsw come by the grill mentio...
14      compani to watch from the sxsw trade show floo...
15      googl marissa mayer futur of locat augment rea...
16      dl the calyp app to get into calyp casa at sxs...
17      well y

In [51]:
#Drop unnecessary columns (the raw and the intermediate text).
#Keyword form: passing the axis positionally is deprecated in pandas 1.3+
#and rejected by pandas 2.x.
df = df.drop(columns=['tweet', 'clean_tweets'])

In [65]:
#Drop any rows that contain missing values before the frame is saved
#NOTE(review): this cell's execution count (65) is out of sequence with its
#neighbours -- re-run the notebook top to bottom to confirm the result
df = df.dropna()

In [52]:
#Save the cleaned frame to train_clean.csv without writing the index column
df.to_csv('train_clean.csv', index = False)