# Import libraries

# In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# In [21]:
class results:
    """Train a chosen classifier on bag-of-words tweet features and print scores.

    The data set is read from a pickle file.  The ``'tweet'`` column supplies
    the text and the column at position 1 supplies the labels.  Calling
    :meth:`Classification` splits the data, vectorizes the text, fits the
    estimator selected by ``model`` and prints training accuracy, validation
    accuracy and the validation F1 score.
    """

    def __init__(self, train_data, model):
        """Load the data set and remember which model to train.

        Args:
            train_data: Location of the pickled data set, passed straight to
                ``pandas.read_pickle``.
            model: Name of the estimator to train.  One of
                ``'randomforestclassifier'``, ``'logisticregression'``,
                ``'decisiontree'``, ``'svc'`` or ``'Xgboost'``
                (case-sensitive).
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        self.data = pd.read_pickle(train_data)

        # The name is validated lazily, when Classification() is called.
        self.model = model

    def train_test_splits(self):
        """Split the tweets and labels into training and testing sets.

        Returns:
            Tuple ``(x_train, x_test, y_train, y_test)`` from a 75/25 split
            with a fixed ``random_state`` of 42.
        """
        # Raw tweet text is the only feature.
        x = self.data['tweet']

        # Labels are taken by position (second column), not by name.
        y = self.data.iloc[:, 1]

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.25, random_state=42
        )

        return x_train, x_test, y_train, y_test

    def vectorization_standardization(self):
        """Turn the split tweets into bag-of-words matrices.

        Both the vectorizer and the scaler are fitted on the training split
        only and then applied to the test split, so no information from the
        test data leaks into the fitted transformers.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test)`` -- note that the
            order differs from :meth:`train_test_splits`.
        """
        x_train, x_test, y_train, y_test = self.train_test_splits()

        # Bag-of-words vectorizer: vocabulary is learned from training data.
        vectorizer = CountVectorizer()
        x_train = vectorizer.fit_transform(x_train)
        x_test = vectorizer.transform(x_test)

        # With centering and scaling both disabled the scaler does not
        # rescale the counts; the step is kept so the pipeline is unchanged.
        scaler = StandardScaler(with_mean=False, with_std=False)
        x_train = scaler.fit_transform(x_train)
        # transform (not fit_transform): never refit on the test split.
        x_test = scaler.transform(x_test)

        return x_train, y_train, x_test, y_test

    def _make_classifier(self):
        """Return a new, unfitted estimator for ``self.model``.

        Raises:
            ValueError: If ``self.model`` is not a supported model name.
        """
        if self.model == 'randomforestclassifier':
            return RandomForestClassifier()
        if self.model == 'logisticregression':
            return LogisticRegression()
        if self.model == 'decisiontree':
            return DecisionTreeClassifier()
        if self.model == 'svc':
            return SVC()
        if self.model == 'Xgboost':
            return XGBClassifier()
        raise ValueError(
            "Unknown model {!r}; expected one of: 'randomforestclassifier', "
            "'logisticregression', 'decisiontree', 'svc', "
            "'Xgboost'".format(self.model)
        )

    def Classification(self):
        """Fit the selected estimator and print its evaluation scores.

        Prints training accuracy, validation accuracy and the validation F1
        score.  ``f1_score`` is called with its default arguments, so this
        assumes binary labels -- TODO confirm against the data set.

        Raises:
            ValueError: If ``self.model`` is not a supported model name.
        """
        # Resolve the estimator first so a bad name fails before any
        # splitting or vectorizing work is done.
        classifier = self._make_classifier()

        x_train, y_train, x_test, y_test = self.vectorization_standardization()

        classifier.fit(x_train, y_train)

        # Predicted classes for the held-out split, used for the F1 score.
        y_pred = classifier.predict(x_test)

        print("Training Accuracy :\n{}".format(classifier.score(x_train, y_train)))
        print("Validation Accuracy :\n{}".format(classifier.score(x_test, y_test)))

        # Format the value into the message instead of printing a literal "{}".
        print("F1 score :\n{}".format(f1_score(y_test, y_pred)))