# Data Preprocessing
1. Import proper packages and open files
2. Clean data
3. Create bag of words and lemmatize
4. Vectorize into one, two, and three words phrases

## Step One: Import proper packages and open files

In [1]:
import pandas as pd
import io
import numpy as np
import csv

In [84]:
train = pd.read_csv('training_data.csv')
judging = pd.read_csv('contestant_judgment.csv')

In [33]:
train.head()

Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1
2,584154,xxcharlx,no way i dont want the tour to end,0
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1
4,28609,umbec,@flockmaster they are chocolate,1


In [4]:
import nltk
import re
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /Users/Jenny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Jenny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Jenny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
print(len(train.at[100000, 'Text']))
print(train.at[100000, 'Text'])

86
@SorenBlackgaard And I think you meant to say, &quot;...to eclipse the Eclipse?&quot; 


## Step Two: Clean Data

In [85]:
stop_words = set(stopwords.words('english'))
filtered_sent = []

for index, row in train.iterrows():
    tokenized_words = []
    # Remove @s
    train.at[index, 'Text'] = re.sub(r"@[A-z]+", " ", train.at[index, 'Text'])
    
    # Remove punct
    train.at[index, 'Text'] = re.sub(r"\W", " ", train.at[index, 'Text'])

    # Remove numbers
    train.at[index, 'Text'] = re.sub(r"\d+", "", train.at[index, 'Text'])

    # Remove spaces
    train.at[index, 'Text'] = re.sub(r"\s+", " ", train.at[index, 'Text'])
   
    # Make lowercase
    train.at[index, 'Text'] = str(train.at[index, 'Text']).lower()
    
    # Tokenize
    tokenized_words = nltk.word_tokenize(train.at[index, 'Text'])
    
    filtered_sent.append([])
    for w in tokenized_words:
        if w not in stop_words:
            filtered_sent[index].append(w)

Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,i heart filling up desk it means sales amp it ...,1
1,523691,Open_Sourcing,sociomat people create prettier younger and b...,1
2,584154,xxcharlx,no way i dont want the tour to end,0
3,1527961,andreapuddu,hi amazing brother sending limitless love you...,1
4,28609,umbec,they are chocolate,1


In [86]:
print(filtered_sent[1])

['sociomat', 'people', 'create', 'prettier', 'younger', 'better', 'looking', 'avatars']
['i heart', 'heart filling', 'filling up', 'up desk', 'desk it', 'it means', 'means sales', 'sales amp', 'amp it', 'it s', 's off', 'off my', 'my desk']


## Step three: Lemmatize bag of words

In [87]:
lemmatizer = WordNetLemmatizer()
i = 0
for words in filtered_sent:
    newWords = []
    
    for word in words:
        newWords.append(lemmatizer.lemmatize(word, pos = 'v'))
     
    filtered_sent[i] = " ".join(newWords)
    i += 1

In [89]:
print(filtered_sent[0])
train.head()

heart fill desk mean sales amp desk


Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,i heart filling up desk it means sales amp it ...,1
1,523691,Open_Sourcing,sociomat people create prettier younger and b...,1
2,584154,xxcharlx,no way i dont want the tour to end,0
3,1527961,andreapuddu,hi amazing brother sending limitless love you...,1
4,28609,umbec,they are chocolate,1


## Step Four: Vectorize

In [90]:
new_column = pd.DataFrame({'BagofWords': filtered_sent})
train = train.merge(new_column, right_index = True, left_index=True)
train.head()

Unnamed: 0,ID,User,Text,Sentiment,BagofWords
0,864192,Carly_FTS,i heart filling up desk it means sales amp it ...,1,heart fill desk mean sales amp desk
1,523691,Open_Sourcing,sociomat people create prettier younger and b...,1,sociomat people create prettier younger better...
2,584154,xxcharlx,no way i dont want the tour to end,0,way dont want tour end
3,1527961,andreapuddu,hi amazing brother sending limitless love you...,1,hi amaze brother send limitless love way twitt...
4,28609,umbec,they are chocolate,1,chocolate


In [91]:
bagofwords = train.BagofWords.tolist()
sentiment = train.Sentiment.tolist()
print(bagofwords[1])

sociomat people create prettier younger better look avatars


In [93]:
cv = CountVectorizer(max_features=750)
cv2 = CountVectorizer(max_features=750, ngram_range=(2, 3))

x_1 = cv.fit_transform(train['BagofWords']).toarray()
x_2 = cv2.fit_transform(train['Text']).toarray()
y = train['Sentiment'].values

header_1 = cv.get_feature_names()
header_2 = cv2.get_feature_names()
output_1 = pd.DataFrame(x_1, columns = header_1)
output_2 = pd.DataFrame(x_2, columns = header_2)
output_1 = output_1.merge(output_2, right_index=True, left_index=True)
output_1['Sentiment'] = train['Sentiment']

output_1.head()

Unnamed: 0,able,absolutely,account,actually,add,afternoon,age,ago,agree,ah,...,you should,you so,you think,you to,you too,you ve,you want,you were,you will,Sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [94]:
output_1.tail()

Unnamed: 0,able,absolutely,account,actually,add,afternoon,age,ago,agree,ah,...,you should,you so,you think,you to,you too,you ve,you want,you were,you will,Sentiment
999995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
999996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
999997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
999998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
