# Preprocessing Pipeline

In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math


#Function to convert data into lowercase
def convert_lower_case(data):
    return np.char.lower(data)


#function to remove stopwords using nltk corpus
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text


#This function removes special character/puntuations with space
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


#This function removes apostrophe
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


#This function uses PorterStemmer for stemming ie to find the root form of the words
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text


#num2words is a library that converts numbers like 42 to words like forty-two.
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text


#This is a preprocessing pipeline which calls all the functions together for cleaning the data.
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

# Text Classification

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [36]:
# Loading CSV file
df_weed = pd.read_csv ("weed_final.csv")

In [37]:
# Top 5 records
df_weed.head()

Unnamed: 0,Brand,Product,Product Name,Product Description,Customer Name,Rating,Review,Locations
0,00 Seeds,Cultivation,White Widow,"Selección de White Widow, planta compacta de g...",legitplug001,Positive,CALL OR TEXT...310.912.31.45 Got Buds/Edibles/...,No Retailers Found Near You
1,00 Seeds,Cultivation,White Widow,"Selección de White Widow, planta compacta de g...",Manue3,Positive,"I have medical cannabis for sale 0z 250, 1/8lb...",No Retailers Found Near You
2,00 Seeds,Cultivation,White Widow,"Selección de White Widow, planta compacta de g...",autreykenneth,Positive,one of the best strains ihave ever tried,No Retailers Found Near You
3,00 Seeds,Cultivation,White Widow,"Selección de White Widow, planta compacta de g...",RasRojas,Positive,como compro weeed,No Retailers Found Near You
4,00 Seeds,Cultivation,White Widow,"Selección de White Widow, planta compacta de g...",Kathyd47,Positive,Need. Intact info for online or phone call in ...,No Retailers Found Near You


In [38]:
# shape of dataframe
df_weed.shape

(20690, 8)

In [39]:
# View data information
df_weed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20690 entries, 0 to 20689
Data columns (total 8 columns):
Brand                  20690 non-null object
Product                20690 non-null object
Product Name           20690 non-null object
Product Description    20690 non-null object
Customer Name          20690 non-null object
Rating                 20690 non-null object
Review                 20129 non-null object
Locations              20690 non-null object
dtypes: object(8)
memory usage: 1.3+ MB


In [40]:
# Feedback Value count
df_weed.Rating.value_counts()

Positive    18307
Negative     1697
Neutral       686
Name: Rating, dtype: int64

In [41]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [55]:
from sklearn.model_selection import train_test_split



ylabels = df_weed['Rating'] # the labels, or answers, we want to test against

preprocessed_text= []
    
for i in range(len(df_weed)):
    
    preprocessed_text.append(str(preprocess(str(df_weed.loc[i, "Review"]))))
    
X = pd.DataFrame(preprocessed_text)

X_train, X_test, y_train, y_test = train_test_split(X[0], ylabels, test_size=0.3)

0        CALL OR TEXT...310.912.31.45 Got Buds/Edibles/...
1        I have medical cannabis for sale 0z 250, 1/8lb...
2                 one of the best strains ihave ever tried
3                                        como compro weeed
4        Need. Intact info for online or phone call in ...
                               ...                        
20685    Not too strong or weak, great flavor combinati...
20686    Nice gooey and chewy brownies. Tasted great. A...
20687    Birthday Cake: 1600 Ice Cream Cake: 1700 Grape...
20688                               Works well and lasts !
20689    Buy medical cannabis, wax, shatter, rick simps...
Name: Review, Length: 20690, dtype: object

In [56]:
# Logistic Regression Classifier
from sklearn.svm import LinearSVC
model = LinearSVC()


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle

count_vect = CountVectorizer(tokenizer = spacy_tokenizer)
X_train_counts = count_vect.fit_transform(X_train.values.astype('U'))
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = model.fit(X_train_tfidf, y_train)


print(clf.predict(count_vect.transform(["This I dont like this product it is bad"])))
y_pred=clf.predict(count_vect.transform(X_test.values.astype('U')))
pickle.dump(clf, open('modelclfsvm.pkl','wb'))
pickle.dump(count_vect, open('count_vect_svm', 'wb'))

['Negative']


In [58]:
from sklearn.metrics import classification_report, confusion_matrix

print("-----CONFUSTION MATRIX-----")

print(confusion_matrix(y_test, y_pred))

print("-----CLASSIFICATION REPORT-----")

print(classification_report(y_test,y_pred))

-----CONFUSTION MATRIX-----
[[ 351   47   96]
 [  57   63   81]
 [ 337  275 4900]]
-----CLASSIFICATION REPORT-----
              precision    recall  f1-score   support

    Negative       0.47      0.71      0.57       494
     Neutral       0.16      0.31      0.22       201
    Positive       0.97      0.89      0.93      5512

    accuracy                           0.86      6207
   macro avg       0.53      0.64      0.57      6207
weighted avg       0.90      0.86      0.87      6207



# Twitter Sentimental Analysis

In [68]:
# Loading CSV file
df_twitter = pd.read_csv ("sentiment3.csv")

In [69]:
from sklearn.model_selection import train_test_split



ytlabels = df_twitter['Sentiment'] # the labels, or answers, we want to test against

preprocessed_text= []
    
for i in range(len(df_twitter)):
    
    preprocessed_text.append(str(preprocess(str(df_twitter.loc[i, "Tweet"]))))
    
Xt = pd.DataFrame(preprocessed_text)

Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt[0], ytlabels, test_size=0.3)

In [70]:
# Logistic Regression Classifier
from sklearn.svm import LinearSVC
model = LinearSVC()

In [72]:
count_vect = CountVectorizer(tokenizer = spacy_tokenizer)
X_train_counts = count_vect.fit_transform(Xt_train.values.astype('U'))
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = model.fit(X_train_tfidf, yt_train)


print(clf.predict(count_vect.transform(["This I like this weed"])))
y_pred=clf.predict(count_vect.transform(Xt_test.values.astype('U')))
pickle.dump(clf, open('modelclfsvm.pkl','wb'))
pickle.dump(count_vect, open('count_vect_svm', 'wb'))

['positive']


In [74]:
from sklearn.metrics import classification_report, confusion_matrix

print("-----CONFUSTION MATRIX-----")

print(confusion_matrix(yt_test, y_pred))

print("-----CLASSIFICATION REPORT-----")

print(classification_report(yt_test,y_pred))

-----CONFUSTION MATRIX-----
[[  7  14   1]
 [  1 160   1]
 [  0  70  46]]
-----CLASSIFICATION REPORT-----
              precision    recall  f1-score   support

    negative       0.88      0.32      0.47        22
     neutral       0.66      0.99      0.79       162
    positive       0.96      0.40      0.56       116

    accuracy                           0.71       300
   macro avg       0.83      0.57      0.61       300
weighted avg       0.79      0.71      0.68       300

