In [2]:
# coding: utf-8

# # Importing modules
# - pandas provides the tabular DataFrame structure
# - seaborn is a plotting wrapper built on top of matplotlib

# File imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Extract_Functional_Zone import process_data

#get_ipython().magic('matplotlib inline')

In [3]:
# Run the pre-processing step on the raw file, then load the processed CSV.
# NOTE(review): presumably process_data() is what writes "Processed_Data.csv" —
# confirm in Extract_Functional_Zone.
name = "raw_data.csv"
process_data(name)

df = pd.read_csv("Processed_Data.csv", encoding="utf-8")

# Display the first processed record as a quick sanity check.
df.loc[0, "Processed"]


'gender: female\ndob : apr 20, 1977\nproduct type: permanent\n'

In [4]:
from nltk.corpus import stopwords,wordnet as wn
from nltk.tokenize import wordpunct_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re


In [5]:
# Strips punctuation that only acts as noise for the downstream tokenizer.

def rem_punt(doc):
    """Replace noise punctuation with spaces, squeeze space runs, lowercase.

    The characters replaced are the newline and  ! " # $ % & ' ( ) * + , - . : @
    The previous pattern ended in the class ``[$!--+@#:]``; its ``!--`` part is
    a *range* from ``!`` to ``-``, which is why ``%``, ``&``, ``'``, ``*`` and
    ``-`` were stripped too. The explicit class below keeps exactly that set,
    so results are unchanged.

    Parameters
    ----------
    doc : str
        Raw text of one record.

    Returns
    -------
    str
        Lower-cased text in which every noise character became a space and
        runs of spaces were squeezed to one. Leading/trailing spaces are kept.
    """
    # Raw string + explicit class: the old non-raw literal relied on invalid
    # string escapes for the parentheses and the dot, and its "--" inside the
    # class makes `re` emit a "Possible set difference" FutureWarning.
    ans = re.sub(r'''[\n!"#$%&'()*+,\-.:@]''', ' ', doc)
    ans = re.sub(' +', ' ', ans)
    return ans.lower()


# English stop words, held in a set so tokenize() can test membership cheaply

stop_word = {word for word in stopwords.words('english')}

def tokenize(document):
    """Turn one document into a flat list of lemmas with stop words removed.

    The text is split into sentences, each sentence into word/punctuation
    tokens, and the tokens are POS-tagged so that ``lemmatize`` receives the
    tag. Tokens found in ``stop_word`` are dropped; the membership test is
    exact, hence case-sensitive.
    """
    return [
        lemmatize(word, pos)
        for sentence in sent_tokenize(document)
        for word, pos in pos_tag(wordpunct_tokenize(sentence))
        if word not in stop_word
    ]

# Lemmatization: reduce each token to its WordNet base form

def lemmatize(token, tag):
    """Return the WordNet lemma of ``token`` for the POS ``tag``.

    Only the first character of ``tag`` is inspected: V, R and J select verb,
    adverb and adjective; N and every other value are treated as a noun.
    """
    first = tag[0]
    if first == 'V':
        pos = wn.VERB
    elif first == 'R':
        pos = wn.ADV
    elif first == 'J':
        pos = wn.ADJ
    else:
        # 'N' and any unrecognised tag both resolve to noun
        pos = wn.NOUN
    return WordNetLemmatizer().lemmatize(token, pos)

In [6]:
# In[22]:

# Clean each record, tokenize/lemmatize it, then persist the enriched frame.
df['Lemmitize'] = df['Processed'].apply(lambda doc: tokenize(rem_punt(doc)))

df.to_csv('NLPProcessed.csv', index=False, encoding="utf-8")

In [7]:
# In[25]:

# Reload the frame from 'NLPProcessed.csv'. After this CSV round trip the
# 'Lemmitize' column holds the string repr of each token list (see the X[0]
# output further down), not a Python list.
df = pd.read_csv('NLPProcessed.csv')


# # Statistical Modeling 

# In[26]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer,LabelEncoder
from sklearn.metrics import accuracy_score,classification_report


# In[28]:

# Features come from the lemmatized text column, labels from 'Offer_noise_free'.
X, y = df['Lemmitize'], df['Offer_noise_free']
#lab_y = LabelEncoder()
#y = lab_y.fit_transform(y)

In [8]:
# Inspect the first feature value as it will be handed to the vectorizer.
X[0]

"['gender', 'female', 'dob', 'apr', '20', '1977', 'product', 'type', 'permanent']"

In [9]:
# In[29]:

# Hold out a test split. The fixed random_state makes the split, and therefore
# every score reported later, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# TF-IDF over word 1- to 3-grams. Terms appearing in more than 80% or fewer
# than 1% of the training documents are ignored; the vocabulary is capped at
# 15000 features.
vect = TfidfVectorizer(max_df=0.8, max_features=15000, min_df=0.01, use_idf=True, ngram_range=(1, 3))

In [10]:

#from xgboost.sklearn import XGBClassifier
#model1 = XGBClassifier(nthread=4,n_estimators=1000)


# Naive Bayes
# MultinomialNB accepts the sparse TF-IDF matrix produced by `vect`;
# GaussianNB requires dense input and fails when fitted behind the vectorizer.

from sklearn.naive_bayes import GaussianNB,MultinomialNB
model2 = MultinomialNB()


# Random Forest Classifier (seeded so repeated runs build the same forest)

from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
model3 = RandomForestClassifier(n_estimators=600,n_jobs=-1,random_state=42)

# SVM Classifier

from sklearn.svm import SVC
model4 = SVC()


# Logistic Regression 

from sklearn.linear_model import LinearRegression,SGDClassifier,LogisticRegression
model5 = LogisticRegression()

# Linear Discriminant Analysis
# NOTE(review): LDA presumably also needs dense input — confirm before using
# it in a pipeline that starts with `vect`.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model6 = LinearDiscriminantAnalysis()
'''
import matplotlib.pyplot as plt 
%matplotlib inline
plt.spy(vect)
'''

'\nimport matplotlib.pyplot as plt \n%matplotlib inline\nplt.spy(vect)\n'

In [11]:
# Model Fitting

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import time

# Parallel accumulators filled by model_making(): entry i of every list
# belongs to the i-th model that was evaluated.
name, results, matrix_confusion = [], [], []
training_time, prediction_time = [], []
def model_making(model_name, vect, model, X_train, y_train, X_test, y_test):
    """Fit ``vect`` + ``model`` as one pipeline and record timing and scores.

    Nothing is returned. One entry is appended to each of the module-level
    lists ``name``, ``results`` (accuracy in percent), ``matrix_confusion``,
    ``training_time`` and ``prediction_time`` (both in seconds).

    Parameters
    ----------
    model_name : str
        Label stored in ``name`` for this run.
    vect : transformer
        First pipeline step (the TF-IDF vectorizer in this notebook).
    model : estimator
        Final pipeline step.
    X_train, y_train
        Training samples and their labels.
    X_test, y_test
        Held-out samples and their labels.
    """
    clf = make_pipeline(vect, model)

    fit_start = time.time()
    clf.fit(X_train, y_train)
    fit_seconds = time.time() - fit_start

    # Predict exactly once: the timed prediction is also the one being scored
    # (previously the timed result was discarded and predict() ran again).
    predict_start = time.time()
    y_pred = clf.predict(X_test)
    predict_seconds = time.time() - predict_start

    accuracy_percent = accuracy_score(y_test, y_pred) * 100
    confusion = confusion_matrix(y_test, y_pred)

    # Append only after every step succeeded so the five lists stay aligned
    # even if fitting, predicting or scoring raises.
    name.append(model_name)
    results.append(accuracy_percent)
    matrix_confusion.append(confusion)
    training_time.append(fit_seconds)
    prediction_time.append(predict_seconds)
    
    #print ("=====Accuracy Score ", "{0:.2f}".format(accuracy_score(y_test, y_pred)*100), "%")
    #print ("=====Confusion Matrix")
    #print (confusion_matrix(y_test, y_pred))
    #target_names = ['class 0', 'class 1', 'class 2']
    #print(classification_report(y_test, y_pred, target_names=target_names))
    

In [12]:
# Evaluate the random forest (model3) on the shared split.
model_making(
    model_name="Random Forest",
    vect=vect,
    model=model3,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [13]:
# Evaluate the support vector classifier (model4) on the shared split.
model_making(
    model_name="SVM",
    vect=vect,
    model=model4,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [14]:
# Evaluate logistic regression (model5) on the shared split.
model_making(
    model_name="Logistic Regression",
    vect=vect,
    model=model5,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [15]:
#model_making("Naive Bayes", vect , model2 , X_train, y_train, X_test, y_test )

In [16]:
# Summary report: dataset sizes followed by one row per evaluated model.
print("Total dataset",len(X))
print("Training dataset: ",len(X_train))
print("Testing dataset: ",len(X_test),"\n")

# Header and rows use the same widths and centring so the columns line up
# (the last column used to be right-aligned with a stray "s" suffix).
# Accuracy is a percentage: model_making stores accuracy_score * 100.
print("{:20} {:^20} {:^20} {:^20}\n ".format("Name" , "Accuracy" , "Training Time(s)" , "Prediction Time(s)" ) )

for model_name, accuracy, fit_seconds, predict_seconds in zip(name, results, training_time, prediction_time):
    print("{:20} {:^20.3f} {:^20.3f} {:^20.3f}\n ".format(model_name, accuracy, fit_seconds, predict_seconds))

Total dataset 6
Training dataset:  4
Testing dataset:  2 

Name                       Accuracy         Training Time(s)    Prediction Time(s) 
 
Random Forest               0.000                2.208                        0.731s 
 
SVM                         0.000                0.014                        0.002s 
 
Logistic Regression         0.000                0.317                        0.003s 
 


In [17]:
# Display the held-out samples produced by train_test_split.
X_test


1    ['gender', 'male', 'dob', '10', '/', '0x', '/'...
2    ['male', 'xx', '/', '31', '/', 'xx', '5', '’',...
Name: Lemmitize, dtype: object