In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Read the training and test review sets from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Replace the raw "response" labels with integer codes; `le` keeps the fitted
# mapping so the codes can be translated back later if needed.
le = LabelEncoder()
train_df["response"] = le.fit_transform(train_df["response"])

# Preview the first rows as the cell output.
train_df.head()



Unnamed: 0,response,review
0,1,A wonderful little production. <br /><br />The...
1,0,Basically there's a family where a little boy ...
2,0,"This show was an amazing, fresh & innovative i..."
3,0,So im not a big fan of Boll's work but then ag...
4,0,"Kind of drawn in by the erotic scenes, only to..."


In [2]:
# creating function for cleaning data
def standardize_text(df, content_field):
    """Clean the text in ``df[content_field]`` in place and return ``df``.

    Steps, in order: drop URLs (``http...`` up to the next whitespace), drop
    any leftover ``http``, drop ``@mention`` tokens, replace every character
    outside the allowed set with a space, spell out remaining ``@`` as "at",
    and lower-case the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    content_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient display or chaining.
    """
    text = df[content_field]
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently skip every substitution below
    # (older versions emit a FutureWarning about exactly this).
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=True)
    df[content_field] = text.str.lower()
    return df
# Clean the training reviews in place; the returned frame is the cell output.
standardize_text(df=train_df, content_field="review")


  df[content_field] = df[content_field].str.replace(r"http\S+", "")
  df[content_field] = df[content_field].str.replace(r"@\S+", "")
  df[content_field] = df[content_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")


Unnamed: 0,response,review
0,1,a wonderful little production br br the...
1,0,basically there's a family where a little boy ...
2,0,"this show was an amazing, fresh innovative i..."
3,0,so im not a big fan of boll's work but then ag...
4,0,"kind of drawn in by the erotic scenes, only to..."
...,...,...
14906,0,robert colomb has two full time jobs he's kno...
14907,0,this is your typical junk comedy br br t...
14908,1,i thought this movie did a down right good job...
14909,0,"bad plot, bad dialogue, bad acting, idiotic di..."


In [3]:
# Contraction -> expansion table. Keys and values are lower-case; matching is
# case-insensitive and the capitalisation of the first letter is carried over.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "wanna": "want to",
    "we're": "we are",
}

# One alternation anchored on word boundaries, so that e.g. "wanna" does not
# fire inside "wannabe". Longest alternatives are listed first.
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")\b",
    re.IGNORECASE,
)

# "Its" -> "It is" is kept only for the capitalised whole word, as before.
# NOTE(review): lower-case "its" is usually the possessive, so it is left alone;
# confirm whether the capitalised rule is wanted at all.
_CAPITAL_ITS_RE = re.compile(r"\bIts\b")


def _expand_contraction(match):
    """Return the expansion for one matched contraction, keeping its leading case."""
    found = match.group(0)
    expansion = _CONTRACTIONS.get(found.lower())
    if expansion is None:
        # Case-insensitive matching can accept a few non-ASCII look-alikes that
        # do not lower-case onto a table key; leave those untouched.
        return found
    if found[0].isupper():
        expansion = expansion[0].upper() + expansion[1:]
    return expansion


def remove_abb(review):
    """Expand common English contractions and informal forms in ``review``.

    Matching is case-insensitive and restricted to whole words, so the function
    works on both original-case and already lower-cased text and never rewrites
    the inside of a longer word.

    Parameters
    ----------
    review : str
        Text to process.

    Returns
    -------
    str
        ``review`` with every known contraction replaced by its expansion.
    """
    expanded = _CONTRACTION_RE.sub(_expand_contraction, review)
    return _CAPITAL_ITS_RE.sub("It is", expanded)

# Expand contractions in every training review, element by element.
train_df["review"] = train_df["review"].map(remove_abb)


In [4]:
from nltk.stem import SnowballStemmer

In [5]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word set and the stemmer once. Recreating them for every
# token (as a per-word `set(stopwords.words(...))` / `SnowballStemmer(...)`
# would) dominates the running time of this cell.
english_stopwords = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

# One cleaned, space-joined token string per training review, in row order.
corpus = []
for raw_review in train_df['review']:
    # Keep letters only, then tokenise on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).split()
    tokens = [token for token in tokens if token not in english_stopwords]
    # Stem first, then lemmatise the stem.
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]
    corpus.append(' '.join(tokens))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
# Target vector: the integer-encoded response column.
y = train_df["response"]

# Feature matrix: TF-IDF weights over uni-, bi- and tri-grams of the cleaned corpus.
# NOTE(review): the vectoriser is fitted on every row before the hold-out split
# made in a later cell, so hold-out text contributes to the vocabulary/IDF.
tfidf = TfidfVectorizer(ngram_range=(1, 3))
X = tfidf.fit_transform(corpus)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform

# Hyper-parameter search space: both values are sampled log-uniformly.
param_distributions = dict(
    C=loguniform(1, 10),
    tol=loguniform(1e-5, 1e-3),
)

# Base estimator whose settings are being tuned.
classifier = LogisticRegression()

# 50 random draws, each scored with 5-fold cross-validation on all cores;
# any failing fit raises immediately instead of being scored as NaN.
random_lr = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    error_score='raise',
)


In [None]:
# Run the randomized search on the training split.
random_lr.fit(X=X_train, y=y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
# Take the best estimator selected by the randomized search and show its settings.
best_model = random_lr.best_estimator_
print(best_model)

# Predict labels for the held-out split produced by train_test_split.
y_pred = best_model.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the report exchanges precision and
# recall and the confusion matrix comes out transposed.
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Rows are true labels, columns are predicted labels; fmt="d" shows raw counts.
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set(xlabel="Predicted label", ylabel="True label")

print(report)
# The tuned estimator is a LogisticRegression, not an SVC.
print("Accuracy Score of Logistic Regression:", acc*100,"%")

In [None]:
# Make predictions on the test set.
# The vectoriser was fitted on cleaned, stop-word-filtered, stemmed and
# lemmatised text, so the test reviews must go through the same steps;
# feeding raw text would produce tokens that mostly miss the fitted vocabulary.
# A copy is cleaned so the raw `test_df` stays intact and the cell can be re-run.
test_clean = standardize_text(test_df.copy(), "review")["review"].apply(remove_abb)

english_stopwords = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

test_corpus = []
for raw_review in test_clean:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).split()
    tokens = [token for token in tokens if token not in english_stopwords]
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]
    test_corpus.append(' '.join(tokens))

test_df_transformed = tfidf.transform(test_corpus)
y_test_pred = best_model.predict(test_df_transformed)

In [None]:
# Rewrite the predictions inside the same array: 0 becomes 1, anything else becomes 2.
# NOTE(review): this is not idempotent - after one run no element equals 0, so
# running the cell a second time turns every prediction into 2.
y_test_pred[:] = np.where(y_test_pred == 0, 1, 2)

In [None]:
# Collect the remapped predictions in a single-column frame for submission.
export_pred = pd.DataFrame({'response': y_test_pred})
# Writing the submission file is currently disabled:
#export_pred.to_csv('nltk_prediction_lr_rs.txt', index=False, header=False)