In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os

In [2]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
df = pd.read_csv(train_path)

<h1>Get to the correct data format</h1>

In [3]:
to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
          'pledged', 'goal', 'category', 'location', 'staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

X_train, y_train = df.drop(to_drop, axis=1).to_numpy(), df['state'].to_numpy()

In [4]:
X_train, y_train

(array([[4.000e+00, 9.000e+01, 2.500e+02, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        [9.000e+00, 1.000e+01, 6.400e+01, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        [5.000e+00, 2.000e+00, 1.290e+02, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        ...,
        [1.400e+01, 5.390e+02, 3.209e+03, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        [4.000e+00, 1.000e+01, 2.500e+01, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        [2.000e+00, 3.000e+01, 1.000e+02, ..., 0.000e+00, 0.000e+00,
         1.000e+00]]),
 array([1, 1, 1, ..., 1, 1, 0], dtype=int64))

<h3>Training using Logisitic Regression Model</h3>

In [5]:
RegressionModel = LogisticRegression(solver = 'lbfgs', max_iter = 200000)
RegressionModel.fit(X_train, y_train)
RegressionModel.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 200000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

<h3> Testing with training model output </h3>

In [6]:
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
test = pd.read_csv(test_path)

X_test, y_test = test.drop(to_drop, axis=1).to_numpy(), test['state'].to_numpy()

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1557, 540)
(1557,)
(293, 540)
(293,)


In [8]:
predictions = RegressionModel.predict(X_test)
score = RegressionModel.score(X_test, y_test)
score

0.8088737201365188

In [9]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

area_under_curve = roc_auc_score(y_test, predictions)
print("Area Under Curve:", area_under_curve)

[[113  31]
 [ 25 124]]
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       144
           1       0.80      0.83      0.82       149

    accuracy                           0.81       293
   macro avg       0.81      0.81      0.81       293
weighted avg       0.81      0.81      0.81       293

Area Under Curve: 0.8084684936614468
