In [1]:
#Import the packages we use

#General packages I use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

#Import NTLK packages for NLP work, for preprocessing our text data
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#SKLEARN packages 
from sklearn.model_selection import train_test_split #the most important one!!
from nltk.classify import NaiveBayesClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#import our data into a Pandas DataFrame
df = pd.read_csv('data/Tweets.csv')

#Feature Select our columns, we will be using text and airline sentiment
df = df[['text','airline_sentiment']]
df.head(3)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral


# NLP Work: We will need to do some preprocessing of our text 

#### The Standard procedure is, and that I use. 

1. Tokenization
2. Remove Stop words
3. Lemmatization


In [3]:
#Now let us create a few functions to faciliate our pre processing of our text data

def nltk_tokenizer(text):
    '''
    Takes in text data. 
    Splits up the words and makes a list for each individual word.
    Each word is then reffered to as token
    '''
    return word_tokenize(text)

#Load up our stop words
stop_words = stopwords.words('english')
#Adds stuff to our stop wors list
stop_words.extend(["@","n't",'.',','])

def nltk_remove_stopwords(text):
    """
    Litearlly removes stopwords
    """
    
    cleaned_text = [] 
    
    for token in text: 
        if token not in stop_words and len(token) > 1: 
            cleaned_text.append(token)
            
    return cleaned_text
        
        #the_tokens = [token for token in the_tokens if token not in stop_words]

def nltk_wn_lemmatizer(token_list):
    '''
    Taking in input in list form
    Lemmatization is the process of converting a word to its base form.
    '''
    lemmatized_list = []
    
    for i in token_list:
        token = WordNetLemmatizer().lemmatize(i)
        lemmatized_list.append(token)
    
    return lemmatized_list


# We need to do this for our Machine Learning Algorithims Later
def the_untokenizer(token_list):
        '''
        Returns all the tokenized words in the list to one string. 
        Used after the pre processing, such as removing stopwords, and lemmatizing. 
        '''
        return " ".join(token_list)


In [4]:
# We created all these helper function, now we need to create a final fucntion that essentially 
# Cleans up our tweets

def cleaning_our_tweets(text):
    
    cleaned_tweets = []
    num_tweets = len(text)
    
    for i in range(num_tweets):
        tweet = text[i]
        tokenized_list = nltk_tokenizer(tweet)
        removed_stopwords = nltk_remove_stopwords(tokenized_list)
        lemmatized_words = nltk_wn_lemmatizer(removed_stopwords)
        back_to_string = the_untokenizer(lemmatized_words)
        
        cleaned_tweets.append(back_to_string)
    
    return cleaned_tweets
    

In [5]:
df['cleaned_tweets'] = cleaning_our_tweets(df['text'])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df.cleaned_tweets, df.airline_sentiment, test_size = 0.2 )

In [52]:
cv = CountVectorizer()
tv = TfidfVectorizer()

X_train = tv.fit_transform(X_train)
X_test = tv.fit_transform(X_test)


In [53]:
multi_nb = MultinomialNB()
multi_nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
train_pred = multi_nb.predict(X_train)
test_pred = multi_nb.predict(X_test)

ValueError: dimension mismatch

In [48]:
print(accuracy_score(y_train, train_pred))

0.7331378299120235
