# Import


In [None]:
import pickle
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import string
import re
import nltk
import matplotlib.pyplot as plt
import matplotlib.axes as axes
import seaborn as sb
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
nltk.download('stopwords')
nltk.download('wordnet')

# Data Management


In [50]:
df = pd.read_csv('sentiment_140.csv', sep=',', names=[
                 "target", "ids", "date", "user", "text"])

In [51]:
df.head(10)


Unnamed: 0,target,ids,date,user,text
0,0,1467810369,Apr 06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Apr 06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Apr 06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Apr 06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Apr 06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Apr 06 22:20:00,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Apr 06 22:20:03,mybirch,Need a hug
7,0,1467811594,Apr 06 22:20:03,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Apr 06 22:20:05,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Apr 06 22:20:09,mimismo,@twittera que me muera ?


In [52]:
df.shape


(1600000, 5)

In [53]:
df.groupby('target').describe()


Unnamed: 0_level_0,ids,ids,ids,ids,ids,ids,ids,ids
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,800000.0,2049457000.0,202584800.0,1467810000.0,1971571000.0,2057298000.0,2220801000.0,2329206000.0
4,800000.0,1948178000.0,169629100.0,1467822000.0,1879943000.0,1985419000.0,2054322000.0,2193602000.0


# Statistics


In [None]:
# TODO Plot stats about the data

# Data CleanUp


In [56]:
regex_urls = r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
regex_urls = r'((www.\S+)|(https?://\S+))'
regex_usernames = r'(@(\s)*[a-zA-Z0-9_]*)'
punct = set(string.punctuation)
stopw = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [57]:
def apply_cleanup(text):
    # Remove URLs
    text = " ".join([word for word in text.split()
                    if not re.match(regex_urls, word)])
    # Remove User-Handles (@username)
    text = " ".join([word for word in text.split()
                    if not re.match(regex_usernames, word)])
    # Remove Numbers
    text = re.sub(r'[0-9]\S+', '', text)
    # Remove punctuation
    text = "".join([character.lower()
                   for character in text if character not in punct])
    # Lemmatize a word if it is not a stopword. When it is a stopword -> remove
    text = ' '.join([lemmatizer.lemmatize(word, "v")
                    for word in text.split() if word not in stopw])
    return text

In [58]:
# Apply preprovessing pipeline
df['text'] = df['text'].apply(apply_cleanup)

In [60]:
df['text'].head(10)

0    awww thats bummer shoulda get david carr third...
1    upset cant update facebook texting might cry r...
2        dive many time ball manage save rest go bound
3                      whole body feel itchy like fire
4                               behave im mad cant see
5                                           whole crew
6                                             need hug
7    hey long time see yes rain bite bite lol im fi...
8                                           nope didnt
9                                            que muera
Name: text, dtype: object

# Model


In [61]:
x = df['text']
y = df['target']

In [62]:
# Split data into data for test training
# Shuffle=True because original data is sorted by column 'target'

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, shuffle=True)

In [63]:
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='word')),
    ('transformer', TfidfTransformer()),
    ('model', GradientBoostingClassifier(random_state=42))
])
set_config(display="diagram")
pipeline

In [None]:
set_config(display="text")
pipeline.fit(x_train, y_train)

In [None]:
y_predicted = pipeline.predict(x_test)

# Evaluation


In [None]:
print("accuracy | " + str(accuracy_score(y_test, y_predicted)))

accuracy | 0.6938939393939394


In [None]:
print(confusion_matrix(y_test, y_predicted))
# TODO Analyze values

[[140973 122348]
 [ 39276 225403]]


In [None]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.78      0.54      0.64    263321
           4       0.65      0.85      0.74    264679

    accuracy                           0.69    528000
   macro avg       0.72      0.69      0.69    528000
weighted avg       0.71      0.69      0.69    528000



# Pickle


In [None]:
# TODO Export pipeline with Pickle 