In [None]:
# Split features and target into train/test partitions. Assuming this is
# scikit-learn's train_test_split (imported in an earlier cell), test_size=0.25
# holds out 25% of the rows and the fixed random_state makes the shuffle
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# vectorize train and test data into data frames
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# Learn the TF-IDF vocabulary from the *training* negative reviews only and
# reuse that fitted vocabulary for the test reviews, so no test-set text
# influences the features. min_df=5 ignores terms that appear in fewer than
# five training documents; tokenisation is delegated to case_punc_stop_lemm.
vectorizer = TfidfVectorizer(tokenizer=case_punc_stop_lemm, min_df=5)
X_train_neg = vectorizer.fit_transform(X_train['Negative_Review'])
X_test_neg = vectorizer.transform(X_test['Negative_Review'])

# validate frequently appearing words in form of data frame
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one on older installations.
_names_getter = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
words = list(_names_getter())

# Column sums of the TF-IDF matrix: the total weight each term accumulates
# over all training negative reviews. The sparse sum is 2-D (1 x n_terms),
# so flatten it to one value per word.
weights = np.asarray(X_train_neg.sum(axis=0)).ravel()

word_weight_df = pd.DataFrame({'word': words, 'weight': weights})
word_weight_df.sort_values(by='weight', ascending=False, inplace=True)

# A bare expression in the middle of a cell/script is evaluated and thrown
# away, so the preview has to be printed explicitly to be visible.
print(word_weight_df.reset_index(drop=True).head(10))

# transform results from sparse matrix format to data frame
def _feature_names(fitted_vectorizer):
    """Return the fitted vocabulary, in column order, as a list of strings.

    get_feature_names() was removed in scikit-learn 1.2; prefer
    get_feature_names_out() and fall back to the old name on older versions.
    """
    getter = getattr(fitted_vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return list(getter())


def _tfidf_frame(matrix, names, prefix):
    """Wrap a sparse TF-IDF matrix in a sparse DataFrame with prefixed columns.

    The prefix keeps negative-review and positive-review features apart: both
    vocabularies can contain the same word, and un-prefixed names would give
    duplicate, indistinguishable column labels once the frames are joined.
    """
    frame = pd.DataFrame.sparse.from_spmatrix(matrix)
    frame.columns = [prefix + str(name) for name in names]
    return frame


# Read the negative-review vocabulary *before* the vectorizer is re-fit below;
# afterwards it only knows the positive-review vocabulary.
neg_names = _feature_names(vectorizer)
train_neg_df = _tfidf_frame(X_train_neg, neg_names, 'neg_')
test_neg_df = _tfidf_frame(X_test_neg, neg_names, 'neg_')

# Re-fit the same vectorizer on the training positive reviews and apply the
# resulting vocabulary to the test positive reviews.
X_train_pos = vectorizer.fit_transform(X_train.Positive_Review)
X_test_pos = vectorizer.transform(X_test.Positive_Review)

pos_names = _feature_names(vectorizer)
train_pos_df = _tfidf_frame(X_train_pos, pos_names, 'pos_')
test_pos_df = _tfidf_frame(X_test_pos, pos_names, 'pos_')

# The raw review text is dropped from the feature frames and the TF-IDF
# frames are appended in its place. Rows are renumbered first because
# concat(axis=1) aligns on the index and the split frames still carry their
# original row labels (the TF-IDF frames are presumably 0..n-1 indexed, as
# they are built straight from the sparse matrices).
review_text_cols = ['Negative_Review', 'Positive_Review']

X_train.drop(columns=review_text_cols, inplace=True)
X_train.reset_index(inplace=True, drop=True)
X_train = pd.concat((X_train, train_neg_df, train_pos_df), axis='columns')

X_test.drop(columns=review_text_cols, inplace=True)
X_test.reset_index(inplace=True, drop=True)
X_test = pd.concat((X_test, test_neg_df, test_pos_df), axis='columns')

from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with its default settings on the
# combined feature frame, then report score() on the training data followed
# by the held-out test data (one value per line).
lr = LogisticRegression()
lr.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(features, labels))