Q1

In [24]:
# Importing the libraries
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# To count the iterations
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Importing the dataset: read the reviews CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
dataset = pd.read_csv('datasets/Reviews.csv')

In [26]:
# Dropping the duplicates in the dataset: rows that share the same user id,
# profile name, timestamp and review text are collapsed to their first
# occurrence (the default `keep='first'`), and a new frame is returned.
dataset = dataset.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
)

In [27]:
def removeHTMLTags(review):
    """Return the text content of ``review`` with all HTML markup removed.

    The string is parsed by BeautifulSoup using the 'lxml' parser and only
    the text nodes are kept.
    """
    return BeautifulSoup(review, 'lxml').get_text()

In [28]:
def removeApostrophe(review):
    """Expand common English contractions in ``review``.

    The substitutions are applied one after another, each on the result of
    the previous one, so the specific forms ("won't", "can't") are rewritten
    before the generic suffix rules ("n't", "'t", ...) get a chance to match.
    Matching is case-sensitive.

    Args:
        review: Text of a single review.

    Returns:
        The text with contractions expanded, e.g. "I won't" -> "I will not".
    """
    # Order matters: whole-word special cases first, generic suffixes after.
    contractions = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    phrase = review
    for pattern, replacement in contractions:
        # Feed the running result back in; substituting on the untouched
        # input every time would discard all but the last replacement.
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [29]:
def removeAlphaNumericWords(review):
    """Delete every whitespace-delimited token that contains a digit.

    Args:
        review: Text of a single review.

    Returns:
        ``review`` with each run of non-space characters holding at least one
        digit removed, and leading/trailing whitespace stripped. Whitespace
        between the remaining tokens is left as it was.
    """
    # Raw string: "\S" and "\d" are regex character classes, not string
    # escapes. A non-raw literal makes Python emit an "invalid escape
    # sequence" warning for them.
    return re.sub(r"\S*\d\S*", "", review).strip()

  return re.sub("\S*\d\S*", "", review).strip()


In [30]:
def removeSpecialChars(review):
    """Replace each character that is not an ASCII letter with one space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', review)

In [31]:
def scorePartition(x):
    """Map a numeric score to a binary label: 0 when below 3, otherwise 1."""
    return 0 if x < 3 else 1

In [32]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORD_CACHE = {}


def _stopWordSet(language='english'):
    """Return the NLTK stop words for ``language`` as a set, loading them once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def doTextCleaning(review):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML, expand contractions, drop tokens containing
    digits, replace non-letters with spaces, lower-case, split on whitespace,
    discard English stop words, and lemmatize the remaining words as verbs
    (WordNet pos 'v').

    Args:
        review: Raw review text.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    review = removeHTMLTags(review)
    review = removeApostrophe(review)
    review = removeAlphaNumericWords(review)
    review = removeSpecialChars(review)
    # Lower casing and tokenization
    tokens = review.lower().split()
    # Removing stop words and lemmatization. The stop-word set is fetched
    # once per call (and cached across calls) instead of being rebuilt from
    # the corpus for every single token.
    stop_words = _stopWordSet()
    lmtzr = WordNetLemmatizer()
    lemmas = [lmtzr.lemmatize(word, 'v') for word in tokens if word not in stop_words]
    return " ".join(lemmas)

In [33]:
# Show the English stop-word list that doTextCleaning filters tokens against.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
# Generalizing the score: collapse the raw score into a binary label with
# scorePartition (score < 3 -> 0, otherwise 1).
# The untouched scores are stashed in 'ActualScore' the first time this cell
# runs, so re-running the cell re-derives the labels from the real scores
# instead of partitioning the 0/1 labels again (which would turn every label
# into 0). The stash is appended as the last column, so the positions of the
# existing columns do not change.
if 'ActualScore' not in dataset.columns:
    dataset['ActualScore'] = dataset['Score']
actualScore = dataset['ActualScore']
positiveNegative = actualScore.map(scorePartition)
dataset['Score'] = positiveNegative


In [35]:
# Inspect the binary labels produced by scorePartition (0 for scores below 3, 1 otherwise).
dataset['Score']

0     1
1     0
2     1
3     0
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    1
14    1
15    1
16    0
17    1
18    1
19    1
20    1
21    1
22    1
Name: Score, dtype: int64

In [36]:
!pip install nltk



In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
# Creating the document corpus: one cleaned string per review, in row order.
# Iterating the 'Text' column directly avoids building a Series for every row
# (which iterrows does), and passing the length lets tqdm show a real
# progress bar with a total instead of a bare iteration counter.
corpus = [
    doTextCleaning(text)
    for text in tqdm(dataset['Text'], total=len(dataset))
]

23it [00:00, 58.32it/s]


In [39]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer


In [40]:
# Creating a transform: bag-of-words counts over uni-, bi- and tri-grams,
# keeping at most 5000 features.
cv = CountVectorizer(ngram_range=(1,3), max_features = 5000)
X = cv.fit_transform(corpus).toarray()
# Select the labels by column name rather than by position, so a different
# column order in the CSV cannot silently feed another column to the model.
y = dataset['Score'].values

In [41]:
# Dimensions of the dense feature matrix: (number of documents, number of n-gram features).
X.shape

(23, 1358)

In [42]:
# Peek at the n-gram count vector of the first document.
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [43]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [44]:
# Train a Gaussian Naive Bayes classifier, predict the held-out split and
# tabulate the outcome in a confusion matrix.
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training chain.
classifier = GaussianNB().fit(X_train, y_train)

# Predicted labels for the test split
y_pred = classifier.predict(X_test)

# Confusion matrix of true versus predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)



In [45]:
# Predict the sentiment for new review
def predictNewReview():
    """Prompt for a review on stdin and print its predicted sentiment.

    The typed text is cleaned with doTextCleaning, vectorized with the fitted
    CountVectorizer ``cv`` and classified by ``classifier``. The cleaned text,
    its feature vector and the raw prediction are printed, followed by
    "Positive Review" (label 1) or "Negative Review" (any other label).
    Empty or whitespace-only input prints "Invalid Review" and nothing else.

    Returns:
        None. All results are written to stdout.
    """
    newReview = input("Type the Review: ")

    # Whitespace-only input carries no text to classify, so it is rejected
    # exactly like an empty string.
    if newReview.strip() == '':
        print('Invalid Review')
        return

    newReview = doTextCleaning(newReview)
    print(newReview)
    new_review1 = cv.transform([newReview]).toarray()
    print(new_review1)
    prediction = classifier.predict(new_review1)
    print(prediction)
    if prediction[0] == 1:
        print( "Positive Review" )
    else:
        print( "Negative Review")

In [46]:
# Interactive check: prompts for a review on stdin and prints the predicted label.
predictNewReview()

Type the Review:  Good


good
[[0 0 0 ... 0 0 0]]
[0]
Negative Review
