In [1]:
import nltk

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# Sample abstract used as the input text; written as adjacent string literals
# (one per sentence) that Python joins into a single string.
paragraph = (
    "This paper describes a weakly supervised system for sentiment analysis in the movie review domain. "
    "The objective is to classify a movie review into a polarity class, positive or negative, based on those sentences bearing opinion on the movie alone, leaving out other irrelevant text. "
    "Wikipedia incorporates the world knowledge of movie-specific features in the system which is used to obtain an extractive summary of the review, consisting of the reviewer’s opinions about the specific aspects of the movie. "
    "This filters out the concepts which are irrelevant or objective with respect to the given movie. "
    "The proposed system, WikiSent, does not require any labeled data for training. "
    "It achieves a better or comparable accuracy to the existing semi-supervised and unsupervised systems in the domain, on the same dataset. "
    "We also perform a general movie review trend analysis using WikiSent."
)

In [4]:
# Split the raw paragraph into sentences and start with an empty corpus.
sentences = nltk.sent_tokenize(paragraph)
corpus = []

# Word normalizers: a Porter stemmer and a WordNet lemmatizer.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [5]:
# Build the English stop-word set once; the membership test below then costs
# O(1) per word instead of reloading and rebuilding the set for every word.
stop_words = set(stopwords.words('english'))

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every sentence to an already-populated corpus.
corpus = []
for sentence in sentences:
    # Replace every non-letter with a space, lower-case, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words and lemmatize the remaining tokens.
    lemmas = [wordnet.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))

In [6]:
# Display the cleaned, lemmatized sentences collected in `corpus`.
corpus

['paper describes weakly supervised system sentiment analysis movie review domain',
 'objective classify movie review polarity class positive negative based sentence bearing opinion movie alone leaving irrelevant text',
 'wikipedia incorporates world knowledge movie specific feature system used obtain extractive summary review consisting reviewer opinion specific aspect movie',
 'filter concept irrelevant objective respect given movie',
 'proposed system wikisent require labeled data training',
 'achieves better comparable accuracy existing semi supervised unsupervised system domain dataset',
 'also perform general movie review trend analysis using wikisent']

In [7]:
# Build TF-IDF features from the cleaned corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
# Learn the vocabulary and weights from the corpus and vectorize it in one pass.
tfidf_matrix = cv.fit_transform(corpus)
# Convert the vectorizer output to a dense NumPy array.
X = tfidf_matrix.toarray()

In [8]:
# Display the dense TF-IDF array produced from `corpus`.
X

array([[0.        , 0.        , 0.        , 0.        , 0.31114743,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37483764, 0.31114743, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.2022683 ,
        0.        , 0.        , 0.        , 0.        , 0.37483764,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.23090726, 0.        , 0.        , 0.        ,
        0.37483764, 0.        , 0.        , 0.31114743, 0.23090726,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37483764, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.26161031, 0.        , 0.        ,
        0.        , 0.26161031, 0.26161031, 0.        , 0.26161031,
        0.26161031, 0.        , 0.        , 0. 