**Importing Libraries**

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, fname) for fname in filenames):
        print(path)

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#from flair.models import TextClassifier
#from flair.data import Sentence
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
# One-time NLTK resource downloads, disabled by wrapping them in a bare string
# literal: the string is evaluated and discarded, so nothing is downloaded.
# NOTE(review): presumably needed on a fresh environment if the NLTK-based
# cleaning code further down is re-enabled — confirm before un-commenting.
'''
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
'''
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries used in the cells below.
warnings.filterwarnings("ignore")

****

**Importing Dataset**

In [8]:
# `df` keeps the frame exactly as read from disk; `data` is the same table with
# every row that contains a missing value removed.
df = pd.read_csv('../input/firetvstick/final4k.csv', sep=',')
data = df.dropna()
data.head()

**Cleaning The Dataset**

In [9]:
# Disabled preprocessing, variant 1 (kept as a bare string literal, so none of
# it runs): strip digits, tokenize, drop non-alphabetic tokens and English
# stopwords, then lemmatize and stem each review in `df`.
# NOTE(review): if re-enabled, the "\d" pattern should be a raw string and, on
# recent pandas, `str.replace` presumably needs `regex=True` — confirm.
'''
stop_words = set(stopwords.words("english"))
df["Review"] = df["Review"].str.replace("\d","")

def cleaner(data):
    # Tokens
    tokens = word_tokenize(str(data).replace("'", "").lower())
    # Remove Puncs
    without_punc = [w for w in tokens if w.isalpha()]
    # Stopwords
    without_sw = [t for t in without_punc if t not in stop_words]
    # Lemmatize
    text_len = [WordNetLemmatizer().lemmatize(t) for t in without_sw]
    # Stem
    text_cleaned = [PorterStemmer().stem(w) for w in text_len]
    return " ".join(text_cleaned)

df["Review"] = df["Review"].apply(cleaner)
df["Review"]
'''
# Disabled preprocessing, variant 2 (also a bare string literal): strip
# punctuation characters, lower-case and split on non-word characters, then
# remove English stopwords, leaving each review as a list of tokens.
# NOTE(review): as the last expression of the cell, this string is presumably
# echoed as the cell's output. The models below read the raw `Review` text
# from `data`, not from `df`.
'''
def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

df['Review']=df['Review'].apply(lambda x: remove_punctuation(x))
#df.head()

def tokenize(text):
    split=re.split("\W+",text) 
    return split

df['Review']=df['Review'].apply(lambda x: tokenize(x.lower()))
#df.head()
stopword = nltk.corpus.stopwords.words('english')
print(stopword[:11])

def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text
df['Review'] = df['Review'].apply(lambda x: remove_stopwords(x))

df.head()
'''

**Assigning independent and dependent variables**

In [10]:
# Features: the review text as an array. Target: the `flair_sentiment` column.
# Other target columns that were tried and left disabled:
# 'Vader', 'TextBlob_Polarity', 'Suggestion/Complaint'.
X = data['Review'].values
Y = data['flair_sentiment']
print(X.shape, Y.shape, sep="\n")

**Split the dataset/ Bag of Words**

In [11]:
# Hold out 35% of the reviews for evaluation. A fixed `random_state` makes the
# shuffle, and therefore every metric reported in the cells below, repeatable
# across Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.35, random_state=42
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

**Apply Bag-of-Words Features to the Split Data**

In [12]:
# Learn the vocabulary from the training reviews only, then encode both splits
# with that same vocabulary.
Vect = CountVectorizer()
Bow_train = Vect.fit_transform(X_train)
Bow_test = Vect.transform(X_test)

# Print (feature-matrix shape, label shape) for the train split, then the test split.
for features, labels in ((Bow_train, Y_train), (Bow_test, Y_test)):
    print(features.shape, labels.shape)

**Decision Tree and Training**

In [13]:
# Depth-limited, class-weighted decision tree on the bag-of-words counts.
# `random_state` is pinned because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits would otherwise be broken
# differently from run to run.
model = DecisionTreeClassifier(max_depth=6, class_weight='balanced', random_state=0)
model.fit(Bow_train, Y_train)
predict = model.predict(Bow_test)

# Rows of the confusion matrix are the actual labels, columns the predictions.
conf_mat = confusion_matrix(Y_test, predict)
# NOTE(review): assumes exactly two classes that sort as negative, positive —
# confirm against the values in the label column.
class_label = ["Negative", "Positive"]
df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)

print ("Accuracy : ", accuracy_score(Y_test, predict)*100)

report = classification_report(Y_test, predict)
print(report)

# Heat-map of the confusion matrix with integer cell annotations.
sns.set()
sns.heatmap(df, annot=True, fmt="d")
plt.title("Test_Confusion_Matrix")
plt.xlabel("Predicted_Label")
plt.ylabel("Actual_Label")
plt.show()


**Naive Bayes ML Algorithm**

In [14]:
# Scale the bag-of-words columns; `with_mean=False` disables centring.
sc = StandardScaler(with_mean=False)
X_train_sc = sc.fit(Bow_train)  # the scaler fitted on the training counts

# The scaled matrices are converted to dense arrays before being handed to GaussianNB.
X_Train = X_train_sc.transform(Bow_train).toarray()
X_Test = X_train_sc.transform(Bow_test).toarray()

# Fit Gaussian Naive Bayes on the training split and predict the held-out split.
classifier = GaussianNB()
classifier.fit(X_Train, Y_train)
y_pred = classifier.predict(X_Test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Draw the confusion matrix on an explicit Axes object.
sns.set()
fig, ax = plt.subplots()
sns.heatmap(df, annot=True, fmt="d", ax=ax)
ax.set_title("Test_Confusion_Matrix")
ax.set_xlabel("Predicted_Label")
ax.set_ylabel("Actual_Label")
plt.show()


**SVM Machine Learning Algorithm**

In [15]:
# Scale the bag-of-words columns (no centring): fit on train, reuse on test.
st_x = StandardScaler(with_mean=False)
x_train = st_x.fit_transform(Bow_train)
x_test = st_x.transform(Bow_test)

# Linear-kernel support vector classifier.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(x_train, Y_train)
y_pred = classifier.predict(x_test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Draw the confusion matrix on an explicit Axes object.
sns.set()
fig, ax = plt.subplots()
sns.heatmap(df, annot=True, fmt="d", ax=ax)
ax.set_title("Test_Confusion_Matrix")
ax.set_xlabel("Predicted_Label")
ax.set_ylabel("Actual_Label")
plt.show()

**XG Boosting**

In [16]:
from xgboost import XGBClassifier

# Scale the bag-of-words columns (no centring): fit on train, reuse on test.
st_x = StandardScaler(with_mean=False)
x_train = st_x.fit_transform(Bow_train)
x_test = st_x.transform(Bow_test)

# Fit a default-configured gradient-boosted classifier on the training data.
# NOTE(review): recent xgboost releases presumably expect integer-encoded class
# labels — confirm the dtype of the target column before relying on this cell.
model = XGBClassifier()
model.fit(x_train, Y_train)
y_pred = model.predict(x_test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Draw the confusion matrix on an explicit Axes object.
sns.set()
fig, ax = plt.subplots()
sns.heatmap(df, annot=True, fmt="d", ax=ax)
ax.set_title("Test_Confusion_Matrix")
ax.set_xlabel("Predicted_Label")
ax.set_ylabel("Actual_Label")
plt.show()

**Bagging on XG Boosting (Bag-of-Words Features)**

In [17]:
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# Scale the bag-of-words columns (no centring): fit on train, reuse on test.
st_x = StandardScaler(with_mean=False)
x_train = st_x.fit_transform(Bow_train)
x_test = st_x.transform(Bow_test)

# Bag 31 XGBoost classifiers. The base learner is passed positionally on
# purpose: its keyword was `base_estimator` in older scikit-learn releases and
# is `estimator` in newer ones, so the positional form runs on both.
xgb = XGBClassifier()
model = BaggingClassifier(xgb, n_estimators=31, random_state=314)
model.fit(x_train, Y_train)

# Predict the held-out split.
y_pred = model.predict(x_test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

# NOTE(review): assumes exactly two classes that sort as negative, positive —
# confirm against the values in the label column.
class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Heat-map of the confusion matrix with integer cell annotations.
sns.set()
sns.heatmap(df, annot=True, fmt="d")
plt.title("Test_Confusion_Matrix")
plt.xlabel("Predicted_Label")
plt.ylabel("Actual_Label")
plt.show()

**Hyperparameter Tuning (Grid Search) on the SVM Model**

In [18]:
from sklearn.model_selection import GridSearchCV
 
# Search space for the RBF-kernel SVC: every combination of C and gamma below.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# `x_train` / `x_test` are the scaled matrices produced by the preceding cell.
# With refit=True the best combination is retrained, and `grid.predict` uses it.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(x_train, Y_train)
y_pred = grid.predict(x_test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Draw the confusion matrix on an explicit Axes object.
sns.set()
fig, ax = plt.subplots()
sns.heatmap(df, annot=True, fmt="d", ax=ax)
ax.set_title("Test_Confusion_Matrix")
ax.set_xlabel("Predicted_Label")
ax.set_ylabel("Actual_Label")
plt.show()

**Bagging On SVM**

In [19]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

#import hasy_tools 

# Bag 31 linear-kernel SVCs. The base learner is passed positionally on
# purpose: its keyword was `base_estimator` in older scikit-learn releases and
# is `estimator` in newer ones, so the positional form runs on both.
# Alternatives that were tried and left disabled:
#svm = LinearSVC(random_state=42)
#dectree = DecisionTreeClassifier(max_depth=6, class_weight='balanced')
#model = BaggingClassifier(base_estimator=dectree, n_estimators=31, random_state=314)
svm = SVC(kernel='linear', random_state=0)
model = BaggingClassifier(svm, n_estimators=31, random_state=314)

# Disabled experiment (bare string literal, never executed): bagging a
# grid-searched RBF SVC.
'''
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
model = BaggingClassifier(base_estimator=grid, n_estimators=3, random_state=314)
'''

# Fit on the scaled training matrix produced by an earlier cell, then predict
# the held-out split.
model.fit(x_train, Y_train)
y_pred = model.predict(x_test)

# Accuracy and confusion matrix (rows = actual, columns = predicted).
ac = accuracy_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)

# NOTE(review): assumes exactly two classes that sort as negative, positive —
# confirm against the values in the label column.
class_label = ["Negative", "Positive"]
df = pd.DataFrame(cm, index=class_label, columns=class_label)

print ("Accuracy : ", ac*100)

report = classification_report(Y_test, y_pred)
print(report)

# Heat-map of the confusion matrix with integer cell annotations.
sns.set()
sns.heatmap(df, annot=True, fmt="d")
plt.title("Test_Confusion_Matrix")
plt.xlabel("Predicted_Label")
plt.ylabel("Actual_Label")
plt.show()