In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression



Importing The Data

In [2]:
train = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip', sep='\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
test = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip', sep='\t')
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


Exploring The Data

In [4]:
train.shape

(25000, 3)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [6]:
train.select_dtypes('object').describe()

Unnamed: 0,id,review
count,25000,25000
unique,25000,24904
top,5814_8,"When i got this movie free from my job, along ..."
freq,1,3


Checking NULL and Duplicated Values

In [7]:
train.isnull().sum().sum()

0

In [8]:
train.duplicated().sum()

0

Droping ID Column

In [9]:
train.drop('id', axis=1, inplace=True)

In [10]:
submission = pd.DataFrame({'id': test['id']})

In [11]:
test.drop('id', axis=1, inplace=True)

Splitting The Training Data

In [12]:
X = train['review'].copy()
y = train['sentiment'].copy()

Vectorizer

In [13]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_prepared = tfidf_vectorizer.fit_transform(X)
test_prepared = tfidf_vectorizer.fit_transform(test['review'])

In [14]:
X_prepared.shape

(25000, 74538)

In [15]:
test_prepared.shape

(25000, 73511)

In [16]:
X_prepared = X_prepared[:, :test_prepared.shape[1]]

Logistic Regression

In [17]:
model = LogisticRegression()
model.fit(X_prepared, y)
y_predict = model.predict(test_prepared)

In [18]:
model.score(X_prepared, y)

0.937

In [19]:
submission['sentiment'] = y_predict

In [20]:
submission.to_csv('submission.csv', index=False)

In [21]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,1
2,5828_4,0
3,7186_2,1
4,12128_7,0
