In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
# Directory for raw input data; not referenced by any cell visible in this notebook.
INPUT_DIR="../data-nlp/input/"
# Directory the folds CSV is read from and the prediction CSV is written to.
OUTPUT_DIR="../data-nlp/output/"

# NOTE(review): presumably a random seed, but no visible cell passes RAND to
# anything — confirm whether it should be wired into the model or removed.
RAND=10

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score

In [4]:
# Read the fold-annotated training set (01_train_folds.csv) from OUTPUT_DIR.
folds_path = OUTPUT_DIR + "01_train_folds.csv"
df = pd.read_csv(folds_path)

# Print column dtypes and non-null counts as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
 3   kfold      25000 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 781.4+ KB


In [5]:
# Make sure every review is a str before it is handed to the vectorizer.
df["review"] = df["review"].astype(str)

In [6]:
def run_training(fold):
    """Train a bag-of-words logistic regression with one fold held out.

    Rows of the notebook-level ``df`` whose ``kfold`` equals ``fold`` form the
    validation split; every other row is used for fitting. The vocabulary is
    learned from the training reviews only, so no validation text leaks into
    the features. The validation ROC AUC is printed as a side effect.

    Parameters
    ----------
    fold
        Value compared against ``df.kfold`` to pick the held-out rows.

    Returns
    -------
    pandas.DataFrame
        The validation rows restricted to the columns ``id``, ``sentiment``,
        ``kfold`` and ``lr_cnt_pred``, where ``lr_cnt_pred`` is column 1 of
        ``predict_proba`` (the second class in ``clf.classes_``).
    """
    fold_of_row = df.kfold
    train_part = df[fold_of_row != fold].reset_index(drop=True)
    valid_part = df[fold_of_row == fold].reset_index(drop=True)

    # Learn the token vocabulary on the training reviews, then encode both splits.
    vectorizer = CountVectorizer().fit(train_part.review.values)
    features_train = vectorizer.transform(train_part.review.values)
    features_valid = vectorizer.transform(valid_part.review.values)

    labels_train = train_part.sentiment.values
    labels_valid = valid_part.sentiment.values

    model = LogisticRegression().fit(features_train, labels_train)
    scores = model.predict_proba(features_valid)[:, 1]

    auc = roc_auc_score(labels_valid, scores)
    print(f"fold={fold}, auc={auc}")

    # Attach the out-of-fold scores next to the identifiers they belong to.
    valid_part["lr_cnt_pred"] = scores
    kept_columns = ["id", "sentiment", "kfold", "lr_cnt_pred"]
    return valid_part[kept_columns]


In [7]:
# Summary of the review text column: count, number of unique texts, most frequent text.
df["review"].describe()

count                                                 25000
unique                                                24904
top       You do realize that you've been watching the E...
freq                                                      3
Name: review, dtype: object

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Take the fold ids from the data rather than hard-coding their count, so the
# loop cannot drift out of sync with the folds file.
fold_ids = sorted(int(k) for k in df.kfold.unique())

dfs = []
for fold in fold_ids:
    output_df = run_training(fold)
    dfs.append(output_df)

# Each per-fold frame has its own 0-based index; ignore_index gives the stacked
# out-of-fold frame a single unique RangeIndex instead of repeated labels.
final_df = pd.concat(dfs, ignore_index=True)

# Every input row must receive exactly one out-of-fold prediction.
assert len(final_df) == len(df), (
    f"expected {len(df)} out-of-fold rows, got {len(final_df)}"
)

fold=0, auc=0.94549216
fold=1, auc=0.94849344
fold=2, auc=0.94504128
fold=3, auc=0.9469352
fold=4, auc=0.94524608


In [10]:
# Peek at the first five out-of-fold prediction rows.
final_df.head(n=5)

Unnamed: 0,id,sentiment,kfold,lr_cnt_pred
0,11703_9,1,0,0.999665
1,6742_8,1,0,0.986673
2,10071_1,0,0,0.465301
3,9841_7,1,0,0.999999
4,11579_10,1,0,0.985156


In [11]:
# Report the size of the stacked prediction frame before persisting it.
print(final_df.shape)

# Write the out-of-fold predictions to OUTPUT_DIR without the index column.
predictions_path = f"{OUTPUT_DIR}02_basemodel_02_logreg_cnt.csv"
final_df.to_csv(predictions_path, index=False)

(25000, 4)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b1ab3b60-9130-40c7-8fa2-288950ad463c' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>