<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#First-Baseline-CountVectorizer-Model-with-NB" data-toc-modified-id="First-Baseline-CountVectorizer-Model-with-NB-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>First Baseline CountVectorizer Model with NB</a></span></li></ul></div>

In [19]:
import pandas as pd
import datetime
import pandas_datareader as pdr
from textblob import TextBlob

import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression

import spacy
nlp = spacy.load('en_core_web_sm')


import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
import pickle
df = pickle.load( open( "../data/df_words_target.pkl", "rb" ) )

In [21]:
df.pct_price_target_same_day.value_counts(normalize=True)

1    0.606557
0    0.393443
Name: pct_price_target_same_day, dtype: float64

In [22]:
def tokenizer_lemmatizer (text): 
    '''
    Initializing tokenizer and lemmatizer to handle NLP preprocessing. 
    1. breakdown the word by alphanumeric characters and dollar with number
    2. Create a list that appended with lemmatized posts and rejoin words by one string 
       alongside removing characters and numbers
    '''
    
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    tokens = [tokenizer.tokenize(post.lower()) for post in (df[text])]
    
    
    lemmatizer = WordNetLemmatizer()
    lems = []
    for post in tokens:
        tok_post = []
        for word in post:
            tok_post.append(lemmatizer.lemmatize(word)) #Remove non-letter
            tok_post.append(re.sub("[^a-zA-Z]", "", lemmatizer.lemmatize(word)))

        posts = " ".join(tok_post)
        lems.append(posts)
    
    words_not_used = [ 'reeve musk', 'wa', 've', 'ha', 'don']
    
    lems = [w for w in lems if not w in words_not_used] #stopwords.words('english')
    
    df[text] = lems #overwrite the df
    
    print (f'tokenizer processed: {len(tokens)}')
    print (f'lemmatizer processed: {len(lems)}')
    #return lems

In [23]:
tokenizer_lemmatizer('transcripts')

tokenizer processed: 61
lemmatizer processed: 61


In [24]:
# Text processing for sentence detection, segmentation, and entity detection
# parsed_review = [nlp(df.transcripts[text]) for text in range(len(df.transcripts))]

In [7]:
# token_text = []
# token_lemma = []
# token_pos = [] 
# token_entity = []
# token_is_stop = []


# for i in range(len(df.transcripts)): 
#     token_text.append([token.text.lower() for token in parsed_review[i]])
#     token_lemma.append([token.lemma_.lower() for token in parsed_review[i]])
#     token_pos.append([token.pos_ for token in parsed_review[i]])

#     token_entity.append([token.ent_type_ for token in parsed_review[i]])
#     token_is_stop.append([token.is_stop for token in parsed_review[i]])


In [None]:
# token_attibutes = pd.DataFrame( zip(token_text[0], token_lemma[0], token_pos[0],token_entity[0], token_is_stop[0]),               
#             columns=['token_text','token_lemma','token_pos','token_entity','token_is_stop'])
# token_attibutes

In [None]:
#token_attibutes.token_entity.unique()

In [None]:
#len(token_attibutes.token_entity[token_attibutes.token_entity == 'ORG'])

In [None]:
#len(token_attibutes.token_pos[token_attibutes.token_pos == 'ADV'])

#### First Baseline CountVectorizer Model with NB

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb = MultinomialNB()

X = df['transcripts']
y = df['pct_price_target']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english') 

X_train_cv_2 = cvec.fit_transform(X_train) # must fit transform to numbers for train 
X_test_cv_2 = cvec.transform(X_test)       # must transform to number for test

nb.fit(X_train_cv_2, y_train)
y_pred_class = nb.predict(X_test_cv_2)

# Calculate accuracy.
print((metrics.accuracy_score(y_test, y_pred_class)))

X_train_cv_df = pd.DataFrame(X_train_cv_2.toarray(), columns = cvec.get_feature_names())

0.6875


**MultinomialNB Model with ngram**

In [26]:
# Use default options for CountVectorizer.
vect = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english', ngram_range=(2, 2))


X = df['transcripts']
y = df['pct_price_target']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=42,
                                                    stratify=y)


# Create document-term matrices.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# Use Naive Bayes to predict the star rating.

nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# Calculate accuracy.
print((metrics.accuracy_score(y_test, y_pred_class)))

0.5625


**LogisticRegression with Tfidf**

In [32]:
X = df['transcripts']
y = df['pct_price_target']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

tvec = TfidfVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english', ngram_range=(2, 3)) 

X_train_tvec_sw = tvec.fit_transform(X_train)
X_test_tvec_sw = tvec.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tvec_sw, y_train)
y_pred_class = lr.predict(X_test_tvec_sw)


print((metrics.accuracy_score(y_test, y_pred_class)))

0.5625




**Retrieve coefficient and features of importance in trigram**

In [29]:
pd.DataFrame(X_train_tvec_sw.toarray(), columns = tvec.get_feature_names())

Unnamed: 0,00 18 27,00 30 47,00 50 ebit,00 50 range,00 51 wa,00 52 21,00 53 09,00 53 33,00 54 00,00 adjusted adjusted,...,zoom production production,zoom zoom production,zotye 46 50,zotye don don,zotye establish establish,zotye ford ford,zotye zotye 46,zotye zotye don,zotye zotye establish,zotye zotye ford
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
features_strength = pd.DataFrame([tvec.get_feature_names(), lr.coef_[0]], 
                             index=['features_nb','coefficient_nb']).T.sort_values(by='coefficient_nb',ascending=False)
features_strength