### 1. Importing Libraries

In [47]:
import pandas
import matplotlib.pyplot as plt
import string

import re
from string import punctuation
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### 2. Reading the csv file and converting into Pandas Dataframe

In [2]:
data = pd.read_csv('train.tsv', delimiter='\t', encoding='utf-8')
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
dsp = pd.DataFrame(data['item_description'])
dsp.head()

Unnamed: 0,item_description
0,No description yet
1,This keyboard is in great condition and works ...
2,Adorable top with a hint of lace and a key hol...
3,New with tags. Leather horses. Retail for [rm]...
4,Complete with certificate of authenticity


In [4]:
dsp = dsp.fillna('Missing')
dsp.isnull().sum()

item_description    0
dtype: int64

In [5]:
dsp['item_condition_id'] = data['item_condition_id']

In [None]:
def processItemCond(cond):
    if cond > 2:
        cond = 1
    else:
        cond = 0
        
    return cond    

binary_cond = []
for cond in dsp['item_condition_id']:
    binary_cond.append(processItemCond(cond)) 
    
dsp['binary item condition'] = binary_cond
dsp.head()

### 3. Cleaning up Item Description Text

In [6]:
# helper function to clean tweets
def processDescription(text):
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    #Convert @username to AT_USER
    text = re.sub('@[^\s]+','',text)
    # Remove tickers
    text = re.sub(r'\$\w*', '', text)
    # To lowercase
    text = text.lower()
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove hashtags
    text = re.sub(r'#\w*', '', text)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', text)
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    # Remove single space remaining at the front of the tweet.
    text = text.lstrip(' ') 
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    return text
# ______________________________________________________________
# clean dataframe's text column
dsp['item_description'] = dsp['item_description'].apply(processDescription)

# preview some cleaned tweets
dsp.head()

Unnamed: 0,item_description,item_condition_id
0,description yet,3
1,this keyboard great condition and works like c...,3
2,adorable top with hint lace and key hole the b...,1
3,new with tags leather horses retail for each s...,1
4,complete with certificate authenticity,1
5,banana republic bottoms candies skirt with mat...,3
6,size small but straps slightly shortened fit b...,3
7,you get three pairs sophie cheer shorts size s...,3
8,girls size small plus green three shorts total,3
9,realized his pants are backwards after the pic...,3


### 4. Counting no. of words in each tweet

In [7]:
# get a word count per sentence column
def count_no_of_words(sentence):
    return len(sentence.split())
    
dsp['word count'] = dsp['item_description'].apply(count_no_of_words)
dsp.head()

Unnamed: 0,item_description,item_condition_id,word count
0,description yet,3,2
1,this keyboard great condition and works like c...,3,29
2,adorable top with hint lace and key hole the b...,1,18
3,new with tags leather horses retail for each s...,1,27
4,complete with certificate authenticity,1,4


In [8]:
# get most common words in training dataset
all_words = []
for line in list(dsp['item_description']):
    words = line.split()
    for word in words:
        all_words.append(word.lower())
    
    
Counter(all_words).most_common(10)

[('and', 842727),
 ('the', 651761),
 ('for', 547873),
 ('new', 497016),
 ('with', 466338),
 ('size', 453835),
 ('brand', 271092),
 ('you', 268868),
 ('free', 267106),
 ('condition', 255439)]

### 5. Removing stopwords and Tokenization

In [9]:
# show stop words examples

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]



In [10]:
# tokenize helper function
def text_process(raw_text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in list(raw_text) if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.lower().split() if word.lower() not in stop_words]


# tokenize message column and create a column for tokens
dsp['tokens'] = dsp['item_description'].apply(text_process) # tokenize style 1

dsp.head()

Unnamed: 0,item_description,item_condition_id,word count,tokens
0,description yet,3,2,"[description, yet]"
1,this keyboard great condition and works like c...,3,29,"[keyboard, great, condition, works, like, came..."
2,adorable top with hint lace and key hole the b...,1,18,"[adorable, top, hint, lace, key, hole, back, p..."
3,new with tags leather horses retail for each s...,1,27,"[new, tags, leather, horses, retail, stand, fo..."
4,complete with certificate authenticity,1,4,"[complete, certificate, authenticity]"


In [11]:
# split sentences to get individual words
all_words = []
for line in dsp['tokens']: # try 'tokens'
    all_words.extend(line)
    
# create a word frequency dictionary
wordfreq = Counter(all_words)
wordfreq

Counter({'description': 91863,
         'yet': 85151,
         'keyboard': 1165,
         'great': 152937,
         'condition': 255439,
         'works': 28291,
         'like': 114422,
         'came': 5561,
         'box': 101506,
         'ports': 1266,
         'tested': 8338,
         'work': 17310,
         'perfectly': 8222,
         'lights': 3712,
         'customizable': 262,
         'via': 4721,
         'razer': 65,
         'synapse': 4,
         'app': 3402,
         'adorable': 10071,
         'top': 65924,
         'hint': 556,
         'lace': 22247,
         'key': 5335,
         'hole': 4577,
         'back': 61490,
         'pale': 2462,
         'pink': 122339,
         'also': 44010,
         'available': 37985,
         'white': 83112,
         'new': 497016,
         'tags': 100914,
         'leather': 28513,
         'horses': 297,
         'retail': 28847,
         'stand': 4700,
         'foot': 3167,
         'high': 40979,
         'sold': 20274,
        

### 6. Vectorization and Transformation of Count Matrix

In [12]:
# vetorize
bow_transformer = CountVectorizer(analyzer=text_process).fit(dsp['item_description'])
# print total number of vocab words
print(len(bow_transformer.vocabulary_))

180928


In [13]:
#entire word vocabulary 
bow_transformer.vocabulary_

{'description': 51531,
 'yet': 160489,
 'keyboard': 84918,
 'great': 70815,
 'condition': 44904,
 'works': 158805,
 'like': 89790,
 'came': 37686,
 'box': 33868,
 'ports': 116300,
 'tested': 145288,
 'work': 158753,
 'perfectly': 112404,
 'lights': 89689,
 'customizable': 48615,
 'via': 154075,
 'razer': 121295,
 'synapse': 142985,
 'app': 24011,
 'adorable': 20479,
 'top': 147760,
 'hint': 75143,
 'lace': 87165,
 'key': 84913,
 'hole': 75527,
 'back': 27421,
 'pale': 110429,
 'pink': 113955,
 'also': 22213,
 'available': 26488,
 'white': 157286,
 'new': 104133,
 'tags': 143571,
 'leather': 88512,
 'horses': 76109,
 'retail': 123939,
 'stand': 138809,
 'foot': 65117,
 'high': 74854,
 'sold': 136288,
 'pair': 110337,
 'questions': 120063,
 'please': 114880,
 'ask': 25394,
 'free': 65929,
 'shipping': 131883,
 'got': 70284,
 'storage': 139872,
 'complete': 44571,
 'certificate': 39816,
 'authenticity': 26292,
 'banana': 28094,
 'republic': 123467,
 'bottoms': 33652,
 'candies': 37937,
 '

In [14]:
# transform the entire DataFrame of messages
sentiment_text_bow = bow_transformer.transform(dsp['item_description'])

# check out the bag-of-words counts for the entire corpus as a large sparse matrix
print('Shape of Sparse Matrix: ', sentiment_text_bow.shape)
print('Amount of Non-Zero occurences: ', sentiment_text_bow.nnz)

Shape of Sparse Matrix:  (1482535, 180928)
Amount of Non-Zero occurences:  21861730


In [15]:
# from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(sentiment_text_bow)

# to transform the entire bag-of-words corpus
sentiment_text_tfidf = tfidf_transformer.transform(sentiment_text_bow)
print(sentiment_text_tfidf.shape)

(1482535, 180928)


### Multinomial Naive Bayes

In [41]:
# Run Train Data Through Pipeline analyzer=text_process
# uncomment below to train on a larger dataset but it is very slow for a regular laptop

X_train, X_test, y_train, y_test = train_test_split(dsp['item_description'][:100000], dsp['binary item condition'][:100000], test_size=0.2)



# create pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(strip_accents='ascii',
                            stop_words='english',
                            lowercase=True)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# this is where we define the values for GridSearchCV to iterate over
parameters = {'bow__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'classifier__alpha': (1e-2, 1e-3),
             }

# do 2-fold cross validation for each of the 8 possible combinations of the above params
grid = GridSearchCV(pipeline, cv=2, param_grid=parameters, verbose=1)
grid.fit(X_train,y_train)

# summarize results
print("\nBest Model: %f using %s" % (grid.best_score_, grid.best_params_))
print('\n')
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean: %f Stdev:(%f) with: %r" % (mean, stdev, param))

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:  1.0min finished



Best Model: 0.793175 using {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}


Mean: 0.772887 Stdev:(0.000093) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.01, 'tfidf__use_idf': True}
Mean: 0.787512 Stdev:(0.000207) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}
Mean: 0.768687 Stdev:(0.000518) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.001, 'tfidf__use_idf': True}
Mean: 0.782787 Stdev:(0.000032) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.001, 'tfidf__use_idf': False}
Mean: 0.779125 Stdev:(0.000481) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.01, 'tfidf__use_idf': True}
Mean: 0.793175 Stdev:(0.000170) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}
Mean: 0.769188 Stdev:(0.001243) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.001, 'tfidf__use_idf': True}
Mean: 0.785212 Stdev:(0.000643) with: {'bow__ngram_range'

In [42]:
# save best model to current working directory
joblib.dump(grid, "item_description_sentiment.pkl")

['item_description_sentiment.pkl']

#### Accuracy Score and Confusion Matrix

In [43]:
# load from file and predict using the best configs found in the CV step
model_NB = joblib.load("item_description_sentiment.pkl" )

# get predictions from best model above
y_preds = model_NB.predict(X_test)

print('accuracy score: ',accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds))
print('\n')
print(classification_report(y_test, y_preds))

accuracy score:  0.79595


confusion matrix: 
 [[12038  1723]
 [ 2358  3881]]


              precision    recall  f1-score   support

           0       0.84      0.87      0.86     13761
           1       0.69      0.62      0.66      6239

   micro avg       0.80      0.80      0.80     20000
   macro avg       0.76      0.75      0.76     20000
weighted avg       0.79      0.80      0.79     20000



### Linear Support Vector Classifier

In [52]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(dsp['item_description'][:80000], dsp['binary item condition'][:80000], test_size=0.2)

text_clf_SVC = Pipeline([('bow', CountVectorizer(strip_accents='ascii',stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('classifier', SVC(kernel='linear')),
])
text_clf_SVC.fit(X_train, y_train)
print('Done!')

Done!


#### Accuracy Score and Confusion Matrix

In [53]:
# get predictions from best model above
y_preds_SVC = text_clf_SVC.predict(X_test)

print('accuracy score: ',accuracy_score(y_test, y_preds_SVC))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds_SVC))
print('\n')
print(classification_report(y_test, y_preds_SVC))

accuracy score:  0.805125


confusion matrix: 
 [[9928 1092]
 [2026 2954]]


              precision    recall  f1-score   support

           0       0.83      0.90      0.86     11020
           1       0.73      0.59      0.65      4980

   micro avg       0.81      0.81      0.81     16000
   macro avg       0.78      0.75      0.76     16000
weighted avg       0.80      0.81      0.80     16000

