In [None]:
# Mount Google Drive inside the Colab runtime; the CSV read by later cells
# lives under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# NOTE(review): `final` is not referenced again in the visible cells, and
# Model.__init__ reads this same CSV path by default, so this load looks
# redundant — confirm nothing outside this view uses it before removing.
final=pd.read_csv('/content/drive/MyDrive/airline_sentiment_analysis.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import re
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

In [None]:
class Model:
    """TF-IDF + SVM sentiment classifier for a tweet CSV.

    The CSV is expected to provide a ``text`` column and an
    ``airline_sentiment`` column. Intended call order::

        preprocess() -> split(test_size) -> fit() -> predict()
    """

    # Patterns removed from every tweet, applied in this order. Raw strings are
    # used so that ``\(`` and ``\)`` reach the regex engine without relying on
    # Python's handling of invalid string escapes.
    _URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                         r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+)")
    _HASHTAG_RE = re.compile(r"(#[A-Za-z0-9_]+)")

    def __init__(self, datafile = '/content/drive/MyDrive/airline_sentiment_analysis.csv'):
        """Load the dataset and create an untrained SVC.

        Args:
            datafile: Path of the CSV file to read with ``pd.read_csv``.
        """
        self.df = pd.read_csv(datafile)
        self.clf = SVC()
        self.cleaned_df = None          # set by preprocess()
        self.tfidf_test_vectors = None  # set by fit()
        self.vectorizer = None          # fitted TfidfVectorizer, set by fit()
        self.model = None               # fitted classifier, set by fit()

    @classmethod
    def _clean_text(cls, text):
        """Return ``text`` with URLs, @mentions and #hashtags removed."""
        for pattern in (cls._URL_RE, cls._MENTION_RE, cls._HASHTAG_RE):
            text = pattern.sub('', text)
        return text

    def preprocess(self):
        """Clean the ``text`` column and store the result in ``self.cleaned_df``.

        The CSV index column ``'Unnamed: 0'`` is dropped when present. Using
        ``errors='ignore'`` makes the method safe to call more than once and on
        files that were saved without that column.
        """
        self.df = self.df.drop(columns=['Unnamed: 0'], errors='ignore')
        cleaned = [self._clean_text(tweet) for tweet in self.df['text']]
        # Fresh 0..n-1 index, positionally aligned with the label array used in split().
        self.cleaned_df = pd.DataFrame({'text': cleaned})

    def split(self, test_size, random_state = 42):
        """Split cleaned text and labels into train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to ``train_test_split``; defaults to 42.

        Raises:
            RuntimeError: If ``preprocess()`` has not been called yet.
        """
        if self.cleaned_df is None:
            raise RuntimeError("preprocess() must be called before split()")

        y = np.array(self.df['airline_sentiment'])
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.cleaned_df, y, test_size = test_size, random_state = random_state)

    def fit(self, vocab_path = "feature.pkl"):
        """Fit the TF-IDF vectorizer and the SVC on the training partition.

        The learned vocabulary is dumped to ``vocab_path`` with joblib, and the
        test partition is vectorized into ``self.tfidf_test_vectors``.

        Args:
            vocab_path: File that receives the pickled vocabulary.
        """
        # Keep the fitted vectorizer on the instance so predict() can transform new text.
        self.vectorizer = TfidfVectorizer()

        train_texts = np.asarray(self.X_train).ravel()
        test_texts = np.asarray(self.X_test).ravel()

        tfidf_train_vectors = self.vectorizer.fit_transform(train_texts)
        # Context manager guarantees the file handle is flushed and closed.
        with open(vocab_path, "wb") as vocab_file:
            joblib.dump(self.vectorizer.vocabulary_, vocab_file)
        self.tfidf_test_vectors = self.vectorizer.transform(test_texts)
        self.model = self.clf.fit(tfidf_train_vectors, self.y_train)

    def predict(self, texts = None):
        """Predict sentiment labels.

        Args:
            texts: Optional string or iterable of strings. When omitted, the
                held-out test partition vectorized by ``fit()`` is scored.
                Supplied texts receive the same cleaning as training data.

        Returns:
            The array of labels returned by the fitted classifier's ``predict``.

        Raises:
            RuntimeError: If ``fit()`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("fit() must be called before predict()")

        if texts is None:
            vectors = self.tfidf_test_vectors
        else:
            if isinstance(texts, str):
                texts = [texts]  # a bare string would otherwise be iterated per character
            vectors = self.vectorizer.transform([self._clean_text(t) for t in texts])
        return self.model.predict(vectors)


# End-to-end run: load and clean the tweets, hold out 20% for evaluation,
# train the classifier, then score it on the held-out partition.
model_instance = Model()
model_instance.preprocess()
model_instance.split(test_size=0.2)
model_instance.fit()

# y_test / y_pred stay at module level because the reporting cell reuses them.
y_test = model_instance.y_test
y_pred = model_instance.predict()

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


0.9142485924642703


In [None]:
from sklearn.utils.multiclass import check_classification_targets
# Per-class precision / recall / F1, followed by the confusion matrix
# (scikit-learn convention: rows are true labels, columns are predictions).
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)


              precision    recall  f1-score   support

    negative       0.91      0.99      0.95      1862
    positive       0.92      0.61      0.73       447

    accuracy                           0.91      2309
   macro avg       0.92      0.80      0.84      2309
weighted avg       0.91      0.91      0.91      2309

[[1839   23]
 [ 175  272]]
