In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

In [48]:
dataset= pd.read_csv(r"C:/Users/Ashutosh Arya/Desktop/Ashu/dataset.csv")
dataset.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,pos
1,The best soundtrack ever to anything.: I'm re...,pos
2,Amazing!: This soundtrack is my favorite musi...,pos
3,Excellent Soundtrack: I truly like this sound...,pos
4,"Remember, Pull Your Jaw Off The Floor After H...",pos


In [None]:
"""DATA PREPROCESSING"""

#Remove blank rows if any.
dataset['text'].dropna(inplace=True)

#Change all the text to lower case
dataset['text'] = [str(entry).lower() for entry in dataset['text']]

#Tokenization : In this each entry in the corpus will be broken into set of words
dataset['text']= [word_tokenize(entry) for entry in dataset['text']]

#Remove Stop words, Non alpha words and perfom Word Stemming/Lemmenting.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(dataset['text']):
    
    Final_words = []
  
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    dataset.loc[index,'text_final'] = str(Final_words)

In [34]:
#Prepare Train and Test Data sets

train_X, test_X, train_Y, test_Y = model_selection.train_test_split(dataset['text_final'],dataset['label'],test_size=0.2,random_state = 42)

In [35]:
#Encoding

Encoder = LabelEncoder()
train_Y = Encoder.fit_transform(train_Y)
test_Y = Encoder.fit_transform(test_Y)

In [36]:
#Word Vectorization

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(dataset['text_final'])
train_X_Tfidf = Tfidf_vect.transform(train_X)
test_X_Tfidf = Tfidf_vect.transform(test_X)

In [40]:
# Classifier - Algorithm - SVM

# fit the training dataset on the classifier
SVM = svm.SVC(C=3.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_X_Tfidf,train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_Y)*100)

SVM Accuracy Score ->  82.85
