In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt 
import copy
from sklearn.model_selection import train_test_split


In [2]:
# Column labels handed to read_excel through `names=`.
names = [
    'artist', 'border_color', 'cmc', 'color_identity', 'colors',
    'frame', 'futureshifted', 'name', 'power', 'promo',
    'rarity', 'released_at', 'reprint', 'reserved', 'set',
    'toughness', 'creature', 'artifact', 'land', 'planeswalker',
    'instant', 'sorcery', 'type_line', 'future', 'frontier',
    'modern', 'legacy', 'pauper', 'vintage', 'penny',
    'commander', '1v1', 'duel', 'brawl', 'standard',
]

# Cells containing '?' are read as missing values.
df = pd.read_excel(
    "scryfall-default-cards-cleaned-more.xlsx",
    names=names,
    na_values='?',
)

# Working subset of columns used by the first experiment (features plus the
# 'standard' label).
df1 = df[[
    'artist', 'border_color', 'frame', 'name', 'reserved', 'future', 'frontier',
    'creature', 'artifact', 'rarity', 'land', 'planeswalker', 'instant', 'sorcery',
    'modern', 'legacy', 'pauper', 'vintage', 'cmc', 'color_identity', 'standard',
]]

In [3]:
def precision_recall(yhat, y , threshold):
    """Threshold predictions, print the confusion counts, return precision/recall.

    Parameters
    ----------
    yhat : array-like
        Predicted scores or labels; a value counts as a positive prediction
        when it is strictly greater than ``threshold``.
    y : array-like
        True labels, where ``1`` marks the positive class and ``0`` the
        negative class. Lists, numpy arrays and pandas Series are accepted;
        values are read positionally.
    threshold : float
        Cut-off applied to ``yhat``.

    Returns
    -------
    tuple of float
        ``(precision, recall)``. A ratio whose denominator is zero (no
        predicted positives, or no actual positives) is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``yhat`` and ``y`` do not contain the same number of elements.
    """
    # Flatten both inputs so column vectors of shape (n, 1) behave like (n,).
    predicted_pos = np.asarray(yhat).ravel() > threshold
    labels = np.asarray(y).ravel()
    if predicted_pos.shape[0] != labels.shape[0]:
        raise ValueError(
            "yhat and y must have the same length, got %d and %d"
            % (predicted_pos.shape[0], labels.shape[0])
        )

    actual_pos = labels == 1

    true_pos = int(np.count_nonzero(predicted_pos & actual_pos))
    false_neg = int(np.count_nonzero(~predicted_pos & actual_pos))
    true_neg = int(np.count_nonzero(~predicted_pos & (labels == 0)))
    # Every remaining sample is a disagreement on a label other than 1.
    false_pos = int(labels.shape[0]) - true_pos - false_neg - true_neg

    predicted_total = true_pos + false_pos
    actual_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total else 0.0
    recall = true_pos / actual_total if actual_total else 0.0

    print('true positive= ',true_pos,'\nfalse positive= ',false_pos,'\ntrue negative= ',true_neg,'\nfalse negative= ',false_neg,'\ntotal= ',true_pos+false_neg+false_pos+true_neg)
    return precision,recall

In [4]:
# Drop exact duplicate rows once, then take independent copies of the result.
# df2 keeps every column (including the 'standard' label) because df1 is
# narrowed to feature columns in a later cell.
df1 = df1.drop_duplicates()
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.copy()

# Both frames should report the same shape.
print(df1.shape)
print(df2.shape)

(32137, 21)
(32137, 21)


In [5]:
# Keep only the attributes that can be seen by looking at a card.
df1 = df1[[
    'border_color', 'creature', 'artifact', 'rarity', 'land',
    'planeswalker', 'instant', 'sorcery', 'color_identity', 'frame',
]]

# Categorical columns are expanded into indicator columns so they can be fed
# to logistic regression.
cols_to_transform = ['border_color', 'rarity', 'color_identity', 'frame']
df_with_dummies = pd.get_dummies(df1, columns=cols_to_transform)

# Feature matrix from the encoded frame; labels from df2, which still holds
# the 'standard' column for the same de-duplicated rows.
x = df_with_dummies.values
y = np.array(df2['standard'])

print(x.shape)
print(y.shape)

(32137, 52)
(32137,)


In [6]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=18,
)

In [7]:
# Two liblinear logistic-regression models that differ only in penalty type.
# C is the inverse regularisation strength, so this very large value leaves
# either penalty almost inactive.
logregL1 = LogisticRegression(penalty='l1', C=100000000, solver='liblinear')
logregL2 = LogisticRegression(penalty='l2', C=100000000, solver='liblinear')

logregL1.fit(X_train, y_train)
# Last expression of the cell: the fitted L2 estimator is displayed.
logregL2.fit(X_train, y_train)

LogisticRegression(C=100000000, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
# ---- L1-penalised model ----
print("Using L1 Regularization:")
yhat = logregL1.predict(X_test)
# Accuracy on the training split and on the held-out split.
acc = logregL1.score(X_train, y_train)
val_acc = logregL1.score(X_test, y_test)
print("Accuracy on training data = %f" % acc)
print("Accuracy on test data = %f" % val_acc)
# Prints the confusion counts; the returned tuple is not displayed here.
precision_recall(yhat, y_test, 0.5)
print("\n")

# ---- L2-penalised model ----
print("Using L2 Regularization:")
yhat2 = logregL2.predict(X_test)
acc2 = logregL2.score(X_train, y_train)
val_acc2 = logregL2.score(X_test, y_test)
print("Accuracy on training data = %f" % acc2)
print("Accuracy on test data = %f" % val_acc2)
# Last expression of the cell: its (precision, recall) tuple is displayed.
precision_recall(yhat2, y_test, 0.5)


Using L1 Regularization:
Accuracy on training data = 0.933630
Accuracy on test data = 0.926364
true positive=  163 
false positive=  80 
true negative=  8769 
false negative=  630 
total=  9642


Using L2 Regularization:
Accuracy on training data = 0.933630
Accuracy on test data = 0.926364
true positive=  163 
false positive=  80 
true negative=  8769 
false negative=  630 
total=  9642


(0.6707818930041153, 0.2055485498108449)

In [9]:
# One-hot encode the 'set' and 'name' columns of the full frame; every
# distinct value in either column becomes its own indicator column.
df3 = pd.get_dummies(df[['set', 'name']], columns=['set', 'name'])

In [10]:
# Second experiment: use only two source columns ('set' and 'name').
# After one-hot encoding these expand into far more columns than the first
# experiment's feature matrix, so this is fewer *source* columns, not fewer
# model features.
# NOTE(review): rows come from `df`, which has not been de-duplicated, so
# identical rows may land on both sides of the split - confirm this is intended.

x=df3.values
y=np.array(df['standard'])

print(x.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=18)
logregL1 = LogisticRegression(C=100000000,penalty='l1',solver='liblinear')
logregL1.fit(X_train,y_train)
logregL2 = LogisticRegression(C=100000000,penalty='l2',solver='liblinear')
logregL2.fit(X_train,y_train)

#using L1 regularization
print("Using L1 Regularization:")
yhat = logregL1.predict(X_test)
# Find the accuracy achieved on training set.
acc = logregL1.score(X_train, y_train)
print("Accuracy on training data = %f" % acc)
# Find the accuracy achieved on test set. Stored in val_acc (not acc) so the
# training accuracy is not overwritten and val_acc is not left stale.
val_acc = logregL1.score(X_test, y_test)
print("Accuracy on test data = %f" % val_acc)
precision_recall(yhat, y_test , 0.5)

print("\n")
#using L2 regularization
print("Using L2 Regularization:")
yhat2 = logregL2.predict(X_test)
# Find the accuracy achieved on training set.
acc2 = logregL2.score(X_train, y_train)
print("Accuracy on training data = %f" % acc2)
# Find the accuracy achieved on test set.
val_acc2 = logregL2.score(X_test, y_test)
print("Accuracy on test data = %f" % val_acc2)
# Last expression of the cell: its (precision, recall) tuple is displayed.
precision_recall(yhat2, y_test , 0.5)


(45026, 19717)
(45026,)
Using L1 Regualrization:
Accuracy on training data = 1.000000
Accuracy on test data = 0.999408
true positive=  1628 
false positive=  0 
true negative=  11872 
false negative=  8 
total=  13508


Using L2 Regualrization:
Accuracy on training data = 1.000000
Accuracy on test data = 0.997631
true positive=  1604 
false positive=  0 
true negative=  11872 
false negative=  32 
total=  13508


(1.0, 0.980440097799511)