In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate
import re
import pickle
import os

print("Setup complete.")

Setup complete.


In [2]:
sentiment_data = pd.read_csv("../data/airline_sentiment_analysis.csv").drop(labels=["Unnamed: 0"], axis=1)
sentiment_data.head()



Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."


## Exploring the Data

In [3]:
print("Positive comments = ", sentiment_data[sentiment_data["airline_sentiment"] == "positive"].count())
print("Negative comments = ", sentiment_data[sentiment_data["airline_sentiment"] == "negative"].count())

Positive comments =  airline_sentiment    2363
text                 2363
dtype: int64
Negative comments =  airline_sentiment    9178
text                 9178
dtype: int64


Hence we can see that there is a huge bias here- the number of negative comments is far more than the positive ones. So we sample out 2363 from them. 

## Preprocess the data

In [4]:
from typing import Dict


def remove_abbreviations(sentence: str) -> str:
    abb_dict: Dict[str, str] = {
        "i'm": "i am",
        "i've": "i have",
        "i'd": "i would",
        "i'll": "i will",
        "he's": "he is",
        "he'd": "he would",
        "he'll": "he will",
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd've": "he would have",
        "he'll've": "he will have",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd've": "i would have",
        "i'll've": "i will have",
        "i'm'a": "i am about to",
        "i'm'o": "i am going to",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you're": "you are",
        "you've": "you have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "'em": "them",
        "wanna": "want to",
        "gonna": "going to",
        "gotta": "got to",
        "lemme": "let me",
        "more'n": "more than",
        "'bout": "about",
        "'til": "until",
        "kinda": "kind of",
        "sorta": "sort of",
        "lotta": "lot of",
        "aught": "ought",
        "methinks": "me thinks",
        "methinks": "me thinks",
        "o'er": "over",
        "tis": "it is",
        "tisn't": "it is not",
        "twas": "it was",
        "twasn't": "it was not",
        "wot": "what",
        "wotcha": "what are you",
        "it's": "it is",
        "you've": "you have",
        "we've": "we have",
        "they've": "they have",
        "i've": "i have",
    }

    for abb, full in abb_dict.items():
        sentence = sentence.replace(abb, full)
    
    return sentence

def preprocess_sentence(sentence: str) -> str:
    sentence = sentence.lower()
    sentence = remove_abbreviations(sentence)
    sentence = re.sub(r'\W', ' ', str(sentence))

    # remove all single characters
    sentence= re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)

    # Remove single characters from the start
    sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence) 

    # Substituting multiple spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence, flags=re.I)

    # Removing prefixed 'b'
    sentence = re.sub(r'^b\s+', '', sentence)

    # Converting to Lowercase
    sentence = sentence.lower()

    return sentence

In [5]:
sentiment_data["text"] = sentiment_data["text"].apply(preprocess_sentence)
sentiment_data.head()

Unnamed: 0,airline_sentiment,text
0,positive,virginamerica plus you have added commercials...
1,negative,virginamerica it is really aggressive to blas...
2,negative,virginamerica and it is really big bad thing ...
3,negative,virginamerica seriously would pay 30 flight f...
4,positive,virginamerica yes nearly every time fly vx th...


In [6]:
vectorizer = None

def get_tf_idf(sentences: pd.Series) -> pd.DataFrame:
    global vectorizer
    print("Input type =", type(sentences))
    vectorizer = TfidfVectorizer()
    vectorizer.fit(sentences)
    response = vectorizer.transform(sentences).toarray()
    print("Output Type = ", type(response))
    return response

In [7]:
result = get_tf_idf(sentiment_data["text"])

Input type = <class 'pandas.core.series.Series'>
Output Type =  <class 'numpy.ndarray'>


In [8]:
print(result)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
df2 = sentiment_data.copy()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(result, sentiment_data["airline_sentiment"], test_size=0.2, random_state=0)


In [18]:
text_classifier = RandomForestClassifier(random_state=42)
text_classifier.fit(X_train, y_train)
text_classifier.fit(X_train, y_train)
# cv_results = cross_validate(text_classifier, X_test, y_test, cv=5)

print(cv_results)

In [13]:
# predictions = text_classifier.predict(X_test)


In [17]:
from typing import List


def train_data_iter(model, class_sampled_data: List[pd.DataFrame]=[]):
    dataset_sizes = [len(class_sampled_data[i]) for i in range(len(class_sampled_data))]
    print(dataset_sizes)
    min_len = min(dataset_sizes)
    print(min_len)
    slice_sizes = [(x//(x // min_len + (x % min_len) // min_len)) for x in dataset_sizes]
    print(slice_sizes)
    slices_counts = [(x // min_len + (x % min_len) // min_len) for x in dataset_sizes]
    print(slices_counts)
    max_iters = max(slices_counts)

    for i in range(max_iters):
        model.set_params(n_estimators=200, warm_start=True)
        training_data = pd.concat([class_sampled_data[i].sample(slice_sizes[i]) for i in range(len(class_sampled_data))], axis=0, ignore_index=True)

        model.fit([training_data["text"]], [training_data["airline_sentiment"]])

    return model

df2 = sentiment_data.copy()
df2["text"] = result
model = train_data_iter(RandomForestClassifier(warm_start=True, random_state=42, n_estimators=200), [df2[df2["airline_sentiment"] == "positive"], df2[df2["airline_sentiment"] == "negative"]])

[2363, 9178]
2363
[2363, 3059]
[1, 3]


  warn(
  warn(


In [16]:

predictions = model.predict(get_tf_idf(sentiment_data["text"]))

print(confusion_matrix(sentiment_data["airline_sentiment"],predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

Input type = <class 'pandas.core.series.Series'>
Output Type =  <class 'numpy.ndarray'>


ValueError: X has 12637 features, but RandomForestClassifier is expecting 5422 features as input.

In [None]:
predictions

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [None]:
from pathlib import Path
BASE_DIR = Path(os.path.abspath(".")).parent.absolute()
print(BASE_DIR)
def save_model(loc: str = f"{BASE_DIR}/backend/api/models/scikit_learn/weights/classifier/best.pkl"):
    with open(loc, "wb") as fout:
        pickle.dump(text_classifier, fout)

/media/anuran/Samsung SSD 970 EVO 1TB/Internship/TrueFoundry/Internship Task


In [None]:
with open(f"{BASE_DIR}/backend/api/models/scikit_learn/weights/vectorizer/best.pkl", "wb") as fout:
    pickle.dump(vectorizer, fout)

In [None]:
save_model()