In [1]:
#Import libraries
import spacy
import gensim
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore")
import pyLDAvis
import pyLDAvis.gensim
from afinn import Afinn
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# Read the training tweets from disk and preview the first five rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# previously saved index - confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [3]:
# Drop rows whose tweet is missing BEFORE casting to str: astype('str') turns
# NaN into the literal text 'nan', which a later dropna() can no longer detect.
df = df.dropna(subset=['tweet'])

# Convert to string
df['tweet'] = df['tweet'].astype('str')

# Strip every character that is not plain ASCII by encoding to ASCII while
# ignoring unencodable characters, then decoding the bytes back to str.
df['tweet'] = df['tweet'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
# Lower-case every tweet so later hashtag and stop-word matching ignores case.
df['tweet'] = df['tweet'].str.lower()

In [6]:
# Materialise the tweet column as a plain Python list.
tweets = list(df['tweet'])

In [7]:
# Collect the hashtags of every tweet (the word characters following '#',
# without the '#' itself), one sub-list per tweet.
hashtags = [re.findall(r"#(\w+)", tweet) for tweet in tweets]

# Flatten into a single lower-cased list, keeping duplicates and tweet order.
flat_list = [tag.lower() for tags_in_tweet in hashtags for tag in tags_in_tweet]

In [8]:
# Get unique list of hashtags, in first-seen order.
# dict.fromkeys de-duplicates in a single pass; checking `x not in list` for
# every element re-scans the growing list each time and is quadratic.
hashtags_unique = list(dict.fromkeys(flat_list))

In [9]:
#Initialize an afinn lexicon
# Constructed with default arguments; its `score` method is applied to each
# unique hashtag in the following cell.
afinn = Afinn()

In [10]:
# Score each unique hashtag with the AFINN lexicon; the result is aligned
# position-by-position with `hashtags_unique`.
afinn_scores = list(map(afinn.score, hashtags_unique))

In [11]:
# Tabulate the scores (column 0) next to the hashtag each one belongs to.
afinn_df = pd.DataFrame(afinn_scores).assign(hashtags=hashtags_unique)

In [12]:
# Keep only the hashtags whose AFINN score is exactly 0; these are the ones
# later added to the stop-word list.
afinn_df = afinn_df.loc[afinn_df[0].eq(0)]

In [13]:
# The zero-score hashtags, as a plain list, to be included as stop words.
hashtag_stop = list(afinn_df['hashtags'])

In [14]:
# function to remove every match of a regex (used below for user handles)
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` deleted.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is replaced by ''.

    Returns
    -------
    str
        The text with all matches removed. Text without a match (including
        the empty string) is returned unchanged.

    Notes
    -----
    The substitution is done directly with ``pattern``. Re-using each matched
    string as a new, unescaped regex is unsafe: metacharacters in the match
    are reinterpreted, and a short match such as '@ab' (or a bare '@') also
    eats part of a longer handle such as '@abc'.
    """
    return re.sub(pattern, '', input_txt)

In [15]:
# Remove user handles (an '@' followed by any run of word characters).
# The pattern is written as a raw string: in an ordinary string literal "\w"
# is an invalid escape sequence, which Python reports with a warning.
df['clean_tweets'] = df['tweet'].apply(lambda row: remove_pattern(row, r"@[\w]*"))

  df['clean_tweets'] = df['tweet'].apply(lambda row: remove_pattern(row, "@[\w]*"))


In [16]:
# Make list of stopwords: every punctuation character, the zero-score
# hashtags, and a few Twitter/markup noise tokens.
# ('.', '_' and '/' are already part of string.punctuation, so they are not
# listed again.)
extra_stop_words = ["{link}", "link", "amp", "'s", "rt"]

# dict.fromkeys drops repeated entries while keeping first-seen order; the
# result stays a list so existing list-style usage keeps working.
stop_words = list(dict.fromkeys(list(punctuation) + hashtag_stop + extra_stop_words))

In [17]:
#remove words with 3 or fewer characters (currently disabled)
#df['clean_tweets'] = df['clean_tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

In [18]:
# Split each handle-free tweet into a list of tokens with NLTK's word tokenizer.
df['tokenized_text'] = df['clean_tweets'].apply(nltk.word_tokenize)

In [19]:
# stopword removal
# Look tokens up in a frozenset built once: membership is O(1) per token,
# instead of scanning the whole stop-word list for every token of every tweet.
stop_lookup = frozenset(stop_words)
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda row: [word for word in row if word not in stop_lookup]
)

In [20]:
#Initialize a lemmatizer
# `lemma.lemmatize` is applied to every remaining token in the next cell.
lemma = WordNetLemmatizer()

In [21]:
# Replace every token by its WordNet lemma, keeping one token list per tweet.
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda tokens: list(map(lemma.lemmatize, tokens))
)

In [22]:
# Re-join each token list into a single space-separated string per tweet.
df['tokenized_text'] = df['tokenized_text'].apply(' '.join)

In [23]:
df['tokenized_text']

0       defining language of touch with different dial...
1       learning ab doodle all doodle should be light ...
2       one of the most in-your-face ex of stealing th...
3       this would b pretty awesome if it did n't cras...
4                             outside the waiting for the
5                                    one lone dude awaits
6                 tip prince toy shopping with zuckerberg
7       nu user for now the includes uberguide to spon...
8                                         free sampler on
9       think might go all without seeing the same cas...
10                              official go bit.ly/hmiiga
11                               it official 'm buying an
12                         they 're giving away x box and
13      we 're officially come by the grill mention w/...
14                     to watch from the trade show floor
15      marissa future of augmented reality contextual...
16      dl the to get into casa the free is available ...
17      well y

In [24]:
# Drop unnecessary columns: the raw tweet and the handle-free intermediate.
# `columns=` is used instead of a positional axis argument, which newer
# pandas releases no longer accept.
df = df.drop(columns=['tweet', 'clean_tweets'])

In [25]:
df = df.dropna()

In [26]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='clean_train4.csv', index=False)