<a href="https://colab.research.google.com/github/arzoozehra/CIND820/blob/main/hypertuned_SVC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#!pip install pyspellchecker
#from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from clean_data import *

ModuleNotFoundError: ignored

**Load data**

In [2]:
# Location of the training split in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/arzoozehra/CIND820/main/data/train.csv'

# Read the training split and discard any row that has a missing value.
train = pd.read_csv(url).dropna()

# The test split is read as-is.
test = pd.read_csv(
    'https://raw.githubusercontent.com/arzoozehra/CIND820/main/data/test.csv'
)

In [3]:
# Show the first and last ten raw texts, one Series after the other.
print(train["text"].head(10), train["text"].tail(10), sep="\n")

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
5    http://www.dothebouncy.com/smf - some shameles...
6    2am feedings for the baby are fun when he is a...
7                                           Soooo high
8                                          Both of you
9     Journey!? Wow... u just became cooler.  hehe....
Name: text, dtype: object
27471    i`m defying gravity. and nobody in alll of oz,...
27472    http://twitpic.com/663vr - Wanted to visit the...
27473     in spoke to you yesterday and u didnt respond...
27474    So I get up early and I feel good about the da...
27475                                       enjoy ur night
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Y

**Clean data**

In [4]:
# Clean the training data first, then the testing data, with the same helper.
train, test = clean_data(train), clean_data(test)

In [6]:
# Show the first and last ten training texts after cleaning.
print(train["text"].head(10), train["text"].tail(10), sep="\n")

0                          would respond go
1                   sooo sad miss san diego
2                                 bos bulli
3                       interview leav alon
4       son could put releas alreadi bought
5    shameless plug best ranger forum earth
6                   feed babi fun smile coo
7                                soooo high
8                                          
9     journey wow becam cooler hehe possibl
Name: text, dtype: object
27471        defi graviti nobodi alll wizard ever go bring
27472                                 want visit anim late
27473           spoke yesterday respond girl wassup though
27474    get earli feel good day walk work feel alright...
27475                                          enjoy night
27476    wish could come see denver husband lost job ca...
27477    wonder rake client made clear net forc dev lea...
27478    yay good enjoy break probabl need hectic weeke...
27479                                                worth
274

In [7]:
# Show the first and last twenty test texts after cleaning.
print(test['text'].head(20), test['text'].tail(20), sep="\n")

0                                      last session day
1     shanghai also realli excit precis skyscrap gal...
2     recess hit veroniqu branquinho quit compani shame
3                                        happi birthday
4                                                  like
5                                    great weee visitor
6                                think everyon hate lol
7        soooooo wish could school myspac complet block
8                           within short time last clue
9     get day alright done anyth yet leav soon steps...
10                bike put hold known argh total bummer
11                                            check win
12                             twitter tavern bore much
13    weekend youngest son turn tomorrow make kind s...
14          come socket feel like phone hole virgin loo
15               hot today like hate new timet bad week
16                                                 miss
17                                              


**Feature extraction (TF-IDF vectorization)**

In [8]:
# --- Vectorization settings -------------------------------------------------

# Tokenize on words (as opposed to character n-grams).
TOKEN_MODE = 'word'

# Inclusive range of n-gram sizes: unigrams and bigrams.
NGRAM_RANGE = (1, 2)

# Tokens appearing in fewer documents than this are dropped from the vocabulary.
MIN_DOCUMENT_FREQUENCY = 5

# Cap on the number of features kept (top 20K). In the visible notebook this
# is only referenced by the commented-out SelectKBest step.
TOP_K = 20000

def ngram_vectorize(train_texts, train_labels, test_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector the length of the vocabulary of unigrams + bigrams.
    The vocabulary and idf weights are learned from the training texts only
    and then applied unchanged to the test texts.

    # Arguments
        train_texts: iterable of str, training text strings.
        train_labels: training labels. Currently unused: they are only needed
            by the (disabled) SelectKBest step below. Kept so existing callers
            keep working and the step can be re-enabled.
        test_texts: iterable of str, test text strings.

    # Returns
        train_vectors, test_vectors: tf-idf matrices for the training and
            test texts, as returned by TfidfVectorizer (sparse), sharing the
            same columns.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,
        analyzer=TOKEN_MODE,
        min_df=MIN_DOCUMENT_FREQUENCY,
        # Ignore tokens that occur in more than 80% of the training texts.
        max_df=0.8,
        # Must be a real boolean: the string 'True' is rejected by
        # scikit-learn's parameter validation.
        sublinear_tf=True,
    )

    # Learn vocabulary from training texts and vectorize training texts.
    train_vectors = vectorizer.fit_transform(train_texts)

    # Vectorize test texts with the vocabulary learned above.
    test_vectors = vectorizer.transform(test_texts)

    # Feature selection is currently disabled. To re-enable it, uncomment:
    # # Select top 'k' of the vectorized features.
    # selector = SelectKBest(f_classif, k=min(TOP_K, train_vectors.shape[1]))
    # selector.fit(train_vectors, train_labels)
    # train_vectors = selector.transform(train_vectors).astype('float32').toarray()
    # test_vectors = selector.transform(test_vectors).astype('float32').toarray()
    return train_vectors, test_vectors


In [None]:
# Create the tf-idf feature vectors for both splits. The vectorizer settings
# live in ngram_vectorize and the vectorization constants.
train_vectors, test_vectors = ngram_vectorize(
    train_texts=train['text'],
    train_labels=train['sentiment'],
    test_texts=test['text'],
)

In [9]:
# Report (n_texts, n_features) for the training and test matrices.
print(np.shape(train_vectors), np.shape(test_vectors), sep="\n")

(27480, 5713)
(3534, 5713)


**SVC (linear kernel)**

In [None]:
# Estimator under evaluation and the label used for it in reports and plots.
model = SVC(kernel='linear')
model_name = "SVC (linear kernel)"

# 5-fold cross-validation on the training vectors: one accuracy per fold.
# cross_val_score fits clones of the estimator, so `model` itself is still
# unfitted after this call.
k = 5
accuracies = cross_val_score(model, train_vectors, train['sentiment'], scoring="accuracy", cv=k)

# Tabulate the per-fold scores as (fold_id, accuracy) rows.
entries = list(enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['fold_id', 'accuracy'])

In [None]:
# Mean and (sample) standard deviation of the per-fold accuracies.
mean_accuracy = cv_df['accuracy'].mean()
std_accuracy = cv_df['accuracy'].std()

# Both values are plain scalars, so build the one-row summary table directly;
# pd.concat only accepts Series/DataFrame objects and raises on floats.
acc = pd.DataFrame({'Accuracy': [mean_accuracy], ' Std dev': [std_accuracy]})
acc

In [None]:
# Train on the full training set, then label the held-out test vectors.
prediction = model.fit(train_vectors, train['sentiment']).predict(test_vectors)

# Accuracy on the test set, reported as a percentage.
print(f"Test set accuracy: {accuracy_score(test['sentiment'], prediction) * 100} %\n")

In [None]:
# Classification report: per-class precision, recall and F1 on the test set.
print(f"\tCLASSIFICATION METRICS - {model_name}\n")
# target_names are matched to the labels in sorted order.
# NOTE(review): assumes the sentiment labels sort as negative, neutral,
# positive - confirm against the cleaned data.
print(classification_report(test['sentiment'], prediction, target_names=['negative', 'neutral', 'positive']))

In [None]:
# Pin the class order to the fitted model's classes so the matrix rows and
# columns always line up with the display labels, even if a class is missing
# from the test labels and predictions.
data = confusion_matrix(test['sentiment'], prediction, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=data, display_labels=model.classes_)
disp.plot(cmap="Blues")
plt.ylabel('ACTUAL')
plt.xlabel('\nPREDICTED')
# Reuse model_name so the title cannot drift from the model being evaluated.
plt.title(f"\nCONFUSION MATRIX - {model_name}\n")
plt.show()