In [70]:
import json
import math
import os
import pathlib
import re
from collections import defaultdict
import seaborn
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import textstat
import joblib
from nltk.stem import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
import pickle
import warnings


In [71]:

def flesch_reading_ease(text):
    """Return a modified Flesch Reading Ease score for ``text``.

    The whole text is treated as a single sentence, so the
    words-per-sentence term is just the word count:

        206.835 - 1.015 * words - 84.6 * (syllables / words)

    Args:
        text: The string to score.

    Returns:
        The score rounded to 2 decimal places. When ``text`` contains no
        words (empty or punctuation-only input) the syllables-per-word term
        is taken as 0 instead of raising ``ZeroDivisionError``.
    """
    syllables = textstat.syllable_count(text)
    words = textstat.lexicon_count(text, removepunct=True)
    # Guard the ratio: a text with zero words would otherwise divide by zero.
    syllables_per_word = syllables / words if words else 0.0
    return round(206.835 - 1.015 * words - 84.6 * syllables_per_word, 2)

def flesch_kincaid_grade_level(text):
    """Return a modified Flesch-Kincaid grade level for ``text``.

    The whole text is treated as a single sentence, so the
    words-per-sentence term is just the word count:

        0.39 * words + 11.8 * (syllables / words) - 15.59

    Args:
        text: The string to score.

    Returns:
        The grade level rounded to 2 decimal places. When ``text`` contains
        no words (empty or punctuation-only input) the syllables-per-word
        term is taken as 0 instead of raising ``ZeroDivisionError``.
    """
    syllables = textstat.syllable_count(text)
    words = textstat.lexicon_count(text, removepunct=True)
    # Guard the ratio: a text with zero words would otherwise divide by zero.
    syllables_per_word = syllables / words if words else 0.0
    return round(0.39 * words + 11.8 * syllables_per_word - 15.59, 2)

# Lowercase the text, collapse whitespace, and strip URLs, mentions and hashtags.
def preprocess(text):
    """Normalise a raw text for feature extraction.

    Steps, in order:
      1. lowercase the text;
      2. collapse every whitespace run into a single space;
      3. delete URLs;
      4. delete ``@mention`` tokens made of word characters and ``-``;
      5. delete ``#hashtag`` tokens;
      6. replace any remaining ``@...`` token with two spaces.

    Whitespace is collapsed *before* the removals, so a removed token can
    leave consecutive spaces behind; callers that split on whitespace are
    unaffected.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw strings: the non-raw originals relied on invalid escape sequences
    # (\s, \w, \(, \S), which newer Python versions warn about.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'

    parsed_text = re.sub(space_pattern, ' ', text.lower())
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(r'#\S+', '', parsed_text)  # remove hashtags
    parsed_text = re.sub(r'@\S+', '  ', parsed_text)  # remove leftover @-tokens
    return parsed_text

def cleaning(text):
    """Clean ``text`` by delegating to ``preprocess`` and return the result."""
    return preprocess(text)
def tokenization_with_stemming(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed token per whitespace-separated piece.
    """
    porter = PorterStemmer()
    stemmed = []
    for word in text.split():
        stemmed.append(porter.stem(word))
    return stemmed

def features(text):
    """Build the hand-crafted feature vector for a single text.

    Args:
        text: The raw (un-preprocessed) string.

    Returns:
        A 13-element list, in this order: FKRA, FRE, syllable count, average
        syllables per word, character count of the cleaned text, character
        count of the raw text, raw term count, cleaned word count, unique
        cleaned word count, then the VADER ``neg``, ``pos``, ``neu`` and
        ``compound`` scores.
    """
    vader = VS()
    polarity = vader.polarity_scores(text)

    cleaned = preprocess(text)  # text with URLs, mentions and hashtags stripped
    cleaned_tokens = cleaned.split()

    syllables = textstat.syllable_count(cleaned)
    # NOTE(review): this counts every character of the cleaned string, spaces
    # included (the original summed len() over the characters of a string).
    num_chars = len(cleaned)
    num_chars_total = len(text)
    num_terms = len(text.split())
    num_words = len(cleaned_tokens)
    # The 0.001 smoothing keeps the ratio defined when there are no words.
    avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
    num_unique_terms = len(set(cleaned_tokens))

    # Modified readability scores: the text counts as a single sentence.
    FKRA = flesch_kincaid_grade_level(text)
    FRE = flesch_reading_ease(text)

    return [
        FKRA,
        FRE,
        syllables,
        avg_syl,
        num_chars,
        num_chars_total,
        num_terms,
        num_words,
        num_unique_terms,
        polarity['neg'],
        polarity['pos'],
        polarity['neu'],
        polarity['compound'],
    ]


def tokenize(text):
    """Lowercase ``text`` and split it into tokens without stemming.

    A token is a maximal run of ASCII letters and the punctuation ``.,!?``;
    any run of other characters (spaces, digits, other symbols) is a
    separator.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens (empty list for empty input).
    """
    # Use '+' rather than '*': a pattern that can match the empty string makes
    # re.split (Python 3.7+) split between every character, which turned each
    # text into single-letter tokens.
    # NOTE(review): if the persisted POS vectorizer was fitted with the old
    # per-character behaviour, refit it so training and inference agree.
    pieces = re.split(r"[^a-zA-Z.,!?]+", text.lower())
    return [piece for piece in pieces if piece]

def get_feature_array(text):
    """Compute ``features`` for every item of ``text`` and stack the results.

    Args:
        text: An iterable of strings.

    Returns:
        A NumPy array with one row of features per input string.
    """
    return np.array([features(item) for item in text])
# Word-level TF-IDF over stemmed tokens (uni- to tri-grams).
# NOTE(review): a custom tokenizer and a token_pattern are both supplied;
# scikit-learn presumably ignores token_pattern in that case - confirm.
# This instance is replaced by the unpickled vectorizer in a later cell.
tfidf_vector = TfidfVectorizer(
    # text handling
    preprocessor=preprocess,
    tokenizer=tokenization_with_stemming,
    token_pattern='[a-zA-Z0-9]+',
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    decode_error='replace',
    # vocabulary
    ngram_range=(1, 3),
    max_features=1500,
    # weighting
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    norm=None,
)

# Plain term counts (IDF and normalisation disabled) over uni- to tri-grams.
pos_vectorizer = TfidfVectorizer(
    # text handling
    preprocessor=None,
    tokenizer=None,
    token_pattern='[a-zA-Z0-9]+',
    stop_words=None,
    lowercase=True,
    strip_accents='unicode',
    decode_error='replace',
    # vocabulary
    ngram_range=(1, 3),
    max_features=1500,
    # weighting
    use_idf=False,
    smooth_idf=False,
    norm=None,
)

# Column names for the list returned by features(), in the same order.
other_features_names = [
    "FKRA",
    "FRE",
    "num_syllables",
    "avg_syllables_per_sent",
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
]

# NLTK English stop-word list (requires the NLTK 'stopwords' corpus).
stopwords = nltk.corpus.stopwords.words("english")

In [72]:
# Sample inputs to classify.
text = ['You are so great', 'This is not worth a damn', 'I hate you']
# Sub-folder that holds the persisted model and vectorizers.
foldername = 'asset Tweet'

# The texts are deliberately left raw: the TF-IDF vectorizer, features() and
# the POS-tagging step each apply preprocess() themselves. (The previous
# `for t in text: t = cleaning(t)` loop only rebound the loop variable and
# never changed `text`.)
print(text)


['You are so great', 'This is not worth a damn', 'I hate you']


In [73]:
# Hatebase Model: load the persisted classifier from the asset folder.
# NOTE(review): absolute, machine-specific path - consider a configurable base directory.
model_path = r'F:\IR_Project\k180198 k180182 k180253 Review Sentiment Analysis\{foldername}\trained_model_logistic'.format(
    foldername=foldername
)
trained_model = joblib.load(model_path)
print(trained_model)

LogisticRegression(class_weight='balanced', solver='newton-cg')


In [74]:
# Load the fitted vectorizers.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# produced yourself. The unpickled vectorizer refers to preprocess() and
# tokenization_with_stemming(), so those must already be defined in this kernel.
with open(r'F:\IR_Project\k180198 k180182 k180253 Review Sentiment Analysis\{foldername}\tfidf'.format(foldername=foldername), 'rb') as tfidf_file:
    tfidf_vector = pickle.load(tfidf_file)
with open(r'F:\IR_Project\k180198 k180182 k180253 Review Sentiment Analysis\{foldername}\pos'.format(foldername=foldername), 'rb') as pos_file:
    pos_vector = pickle.load(pos_file)

# Word n-gram TF-IDF features.
tfidf = tfidf_vector.transform(text).toarray()

# Hand-crafted features. Stored under a name other than `features` so the
# features() function is not overwritten and this cell can be re-run.
other_features = get_feature_array(text)

# Get parts of speech tags for text and save as a string
text_tag = []
for t in text:
    tokens = tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    text_tag.append(" ".join(tag for _, tag in tags))

pos = pos_vector.transform(pd.Series(text_tag)).toarray()

# Column order: TF-IDF, POS n-grams, hand-crafted features.
M = np.concatenate([tfidf, pos, other_features], axis=1)

X = pd.DataFrame(M)
print(X.shape)

(3, 2399)


In [75]:
# Show the configuration of the TF-IDF vectorizer that was loaded from disk above.
print(tfidf_vector)

TfidfVectorizer(decode_error='replace', max_features=1500, ngram_range=(1, 3),
                norm=None,
                preprocessor=<function preprocess at 0x000002A442D92790>,
                stop_words='english', strip_accents='unicode',
                sublinear_tf=True, token_pattern='[a-zA-Z0-9]+',
                tokenizer=<function tokenization_with_stemming at 0x000002A442D92670>)


In [76]:
# Predict one class id per row of X (one row per input text); see the label legend below.
trained_model.predict(X)

array([2, 1, 0])

In [77]:
# Class labels:
#   0 - hate speech
#   1 - offensive language
#   2 - neither
