In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import re, string, unicodedata
import sys
!{sys.executable} -m pip install contractions
import contractions

PATH = '../data/Reviews.csv'
sns.set(color_codes=True)

[nltk_data] Downloading package punkt to /Users/tesfami1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tesfami1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### Helper methods for Text Normalizer

In [2]:

def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token in `words`."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed; a token made only of such characters disappears from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the tokens of `words`, in order, that are not in the
        NLTK English stop-word list. Matching is exact (case-sensitive).
    """
    # Load the stop-word list once and keep it in a set: the corpus lookup is
    # loop-invariant, and set membership avoids scanning a list per token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in `words`, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token in `words` as a verb (WordNet, ``pos='v'``)."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def replace_contractions(text):
    """Return `text` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def normalize(words):
    """Run the token normalization pipeline and return the resulting list.

    Steps, in order: lower-casing, punctuation removal, verb lemmatization.
    Stop-word removal (``remove_stopwords``) is currently not part of the
    pipeline.
    """
    pipeline = (to_lowercase, remove_punctuation, lemmatize_verbs)
    for step in pipeline:
        words = step(words)
    return words


### In Progress (80%): XGBoost Model (using only 30,000 rows for now)

In [26]:

class XGBoostModel:
    """Binary sentiment classifier: bag-of-words text features fed to XGBoost.

    Reviews are read from a CSV, labelled positive (``Score > 3``) or
    negative (``Score < 3``), vectorized with raw counts or TF-IDF, and used
    to fit an ``xgb.XGBClassifier``.

    Args:
        vectorizer: ``'COUNT'`` or ``'TF_IDF'``; any other value makes the
            training methods raise ``ValueError``.
        path: CSV file to load; falls back to the module-level ``PATH`` when
            empty.
    """

    # Only the first MAX_ROWS rows (after dropping rows with NaNs) are used.
    MAX_ROWS = 30000
    # Stop words shared by both vectorizers.
    STOP_WORDS = ('this', 'that', 'a', 'i', 'he', 'she', 'they', 'we')
    # Classifier parameters used by plain_train when the caller passes none.
    DEFAULT_PARAMS = {'objective': 'binary:logistic', 'colsample_bytree': 0.3,
                      'max_depth': 5, 'reg_alpha': 10}

    def __init__(self, vectorizer, path=''):
        """Read the data as a dataframe and reset all training state."""
        self.path = path or PATH
        self.data = pd.read_csv(self.path)
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.vect = None
        self.vect_type = vectorizer
        self.model = None

    # Data pre-processing section

    def __prepareData__(self):
        """Drop unusable rows and add the binary ``Positivity`` target column.

        Rows with missing values, neutral reviews (``Score == 3``) and
        reviews with ``HelpfulnessNumerator == 0`` are removed. The result
        replaces ``self.data``; the frame previously held there is not
        modified in place.
        """
        data = self.data.dropna().iloc[:self.MAX_ROWS, :]
        keep = (data['Score'] != 3) & (data['HelpfulnessNumerator'] != 0)
        # .copy() so the column assignment below writes to an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        data = data[keep].copy()
        data['Positivity'] = np.where(data['Score'] > 3, 1, 0)
        self.data = data

    def __splitDataFromTarget__(self):
        """Prepare the data and return ``(X, y)`` with ``y`` = ``Positivity``."""
        self.__prepareData__()
        X = self.data.drop(columns='Positivity')
        y = self.data['Positivity']
        return X, y

    def __trainTestSplit__(self):
        """Split the review text and target into 80% train / 20% test."""
        X, y = self.__splitDataFromTarget__()
        X_train, X_test, y_train, y_test = train_test_split(
            X['Text'], y, test_size=0.2, random_state=15)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def tokenizeNormalize(self, string):
        """Expand contractions, tokenize with NLTK, then normalize the tokens."""
        string = replace_contractions(string)
        words = nltk.word_tokenize(string)
        return normalize(words)

    # Vectorizer construction

    def _make_count_vectorizer(self):
        """Return an unfitted unigram+bigram CountVectorizer."""
        return CountVectorizer(min_df=1, ngram_range=(1, 2),
                               stop_words=list(self.STOP_WORDS))

    def _make_tfidf_vectorizer(self):
        """Return an unfitted TfidfVectorizer using ``tokenizeNormalize``."""
        return TfidfVectorizer(stop_words=list(self.STOP_WORDS),
                               tokenizer=self.tokenizeNormalize)

    def _make_vectorizer(self):
        """Return an unfitted vectorizer matching ``self.vect_type``.

        Raises:
            ValueError: if ``self.vect_type`` is neither 'COUNT' nor 'TF_IDF'.
        """
        if self.vect_type == 'COUNT':
            return self._make_count_vectorizer()
        if self.vect_type == 'TF_IDF':
            return self._make_tfidf_vectorizer()
        raise ValueError('vectorizer type cannot be none. It should be COUNT or TF_IDF.')

    def count_vectorizer(self, data):
        """Fit a count vectorizer on ``self.X_train`` and transform `data`.

        ``self.X_train`` must hold raw text when this is called, i.e. after
        ``__trainTestSplit__`` and before ``plain_train`` replaces it with the
        feature matrix. The fitted vectorizer is stored in ``self.vect``.
        """
        self.vect = self._make_count_vectorizer().fit(self.X_train)
        return self.vect.transform(data)

    def tf_idf_vectorizer(self, data):
        """Fit a TF-IDF vectorizer on ``self.X_train`` and transform `data`.

        Same preconditions and side effect on ``self.vect`` as
        ``count_vectorizer``.
        """
        self.vect = self._make_tfidf_vectorizer().fit(self.X_train)
        return self.vect.transform(data)

    def quickView(self):
        """Return the first 5 rows of the data (rendered as a table by Jupyter)."""
        return self.data.head()

    # XGBoost provides a wrapper class to allow models
    # to be treated like classifiers or regressors in the scikit-learn framework.
    # Models are fit using the scikit-learn API and the model.fit() function.

    # Model trainer and testing functions

    def plain_train(self, params=None):
        """Train the classifier on the training split, without cross-validation.

        Args:
            params: keyword arguments for ``xgb.XGBClassifier``; when empty or
                None, ``DEFAULT_PARAMS`` is used.

        Raises:
            ValueError: if the vectorizer type is neither 'COUNT' nor 'TF_IDF'.

        Side effects: sets ``self.vect`` and ``self.model``, and replaces
        ``self.X_train`` with the vectorized training matrix.
        """
        # Validate the vectorizer type before doing any data work.
        vectorizer = self._make_vectorizer()
        self.__trainTestSplit__()
        self.vect = vectorizer.fit(self.X_train)
        self.X_train = self.vect.transform(self.X_train)
        params = params or dict(self.DEFAULT_PARAMS)
        self.model = xgb.XGBClassifier(**params)
        self.model.fit(self.X_train, self.y_train)
        print('Training models without cross-validation finished. Run tests!')

    def test_model(self):
        """Predict on the held-out test split and print the accuracy."""
        if self.model is None:
            print("Error: prepare data and train model first. :[ ")
            return
        predictions = self.model.predict(self.vect.transform(self.X_test))
        print(f'Accuracy: {(self.y_test==predictions).sum()*100/self.y_test.shape[0]}%')

    def cv_train(self, params=None):
        """Run 3-fold cross-validation (AUC) on the whole prepared dataset.

        The review text is vectorized with a vectorizer local to this call,
        so ``self.vect`` and ``self.model`` from an earlier ``plain_train``
        are left untouched. ``self.data`` is re-prepared as a side effect.

        Args:
            params: booster parameters for ``xgb.cv``; when empty or None a
                binary logistic objective is used, matching the 0/1 target.

        Returns:
            The cross-validation results frame, which is also printed.

        Raises:
            ValueError: if the vectorizer type is neither 'COUNT' nor 'TF_IDF'.
        """
        vectorizer = self._make_vectorizer()
        X, y = self.__splitDataFromTarget__()
        # DMatrix needs numeric features, so vectorize the text column first.
        features = vectorizer.fit_transform(X['Text'])
        data_dmatrix = xgb.DMatrix(data=features, label=y)
        params = params or {'objective': 'binary:logistic'}
        cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                            num_boost_round=50, early_stopping_rounds=10,
                            metrics="auc", as_pandas=True, seed=123)
        print(cv_results)
        return cv_results
        

### Test 1 (when using simple count vectorizer)

In [27]:
# Test 1: build the model with the simple count vectorizer ("COUNT") and train
# it without cross-validation. Constructing XGBoostModel reads the CSV.
model = XGBoostModel("COUNT")
model.plain_train()

Training models without cross-validation finished. Run tests!


In [34]:
# Spot-check the count-vectorizer model on a few hand-written reviews.
test_corpus = [
    'I like this product a lot.',
    'I love it',
    'It is so good',
    'I will not buy it again',
    'I hate this product. I am never going to get it',
]
model.model.predict(model.vect.transform(test_corpus))

  if diff:


array([1, 1, 1, 0, 1])

In [36]:
# Evaluate the count-vectorizer model on the held-out test split;
# test_model prints the accuracy.
model.test_model()

Accuracy: 85.05050505050505%


  if diff:


### Test 2 (when using TF_IDF vectorizer)

In [37]:
# Test 2: build the model with the TF-IDF vectorizer ("TF_IDF") and train it
# without cross-validation. Constructing XGBoostModel reads the CSV.
model2 = XGBoostModel("TF_IDF")
model2.plain_train()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Training models without cross-validation finished. Run tests!


In [39]:
# Spot-check the TF-IDF model on a few hand-written reviews.
test_corpus2 = [
    'I like this product a lot.',
    'I hate this product. I am never going to get it again',
    'I love it',
    'It is so good',
    'I will not buy it again',
]
model2.model.predict(model2.vect.transform(test_corpus2))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if diff:


array([1, 1, 1, 1, 0])

In [40]:
# Evaluate the TF-IDF model on the held-out test split;
# test_model prints the accuracy.
model2.test_model()

Accuracy: 86.1010101010101%


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if diff:


### Test 3 (when 3-fold cross-validation is used)

In [None]:
# `model3` was referenced without ever being defined (NameError on a fresh
# kernel). Build it explicitly and run the 3-fold cross-validation.
model3 = XGBoostModel("TF_IDF")
model3.cv_train()

### In Progress (data visualization)

In [3]:
# class for Exploratory Data Analysis
#In Progress
class ED_analysis:
    """Exploratory data analysis helpers for the reviews CSV (work in progress).

    Args:
        path: CSV file to load; falls back to the module-level ``PATH`` when
            empty.
    """

    def __init__(self, path=''):
        """Load the CSV into ``self.dframe``."""
        self.dframe = pd.read_csv(path or PATH)

    # prepare data for feature analysis

    def __preprocess__(self):
        """Drop every row of ``self.dframe`` that contains a missing value."""
        self.dframe = self.dframe.dropna()

    def dist(self, colName, title):
        """Draw a 100-bin histogram of column `colName` on the current figure.

        Args:
            colName: column of ``self.dframe`` to plot; missing values are
                ignored. Also used as the x-axis label.
            title: figure title.

        Note: switches the global matplotlib style to 'fivethirtyeight'.
        """
        plt.style.use('fivethirtyeight')
        plt.hist(self.dframe[colName].dropna(), bins=100, edgecolor='k')
        plt.xlabel(colName)
        plt.ylabel('Number of Reviews')
        plt.title(title)

    # The three plots below were declared without a body (and with `;` in
    # place of `:`), which made the whole cell a SyntaxError. They are kept as
    # explicit placeholders until implemented.

    def scatterPlot(self):
        """Placeholder: scatter plot (not implemented yet)."""
        raise NotImplementedError('scatterPlot is not implemented yet')

    def densityPlot(self, colName, catVar=None):
        """Placeholder: density plot of `colName` (not implemented yet)."""
        raise NotImplementedError('densityPlot is not implemented yet')

    def parisPlot(self):
        """Placeholder: presumably a pairs plot (not implemented yet)."""
        raise NotImplementedError('parisPlot is not implemented yet')

    