In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast

In [2]:
df=sns.load_dataset('titanic')

In [3]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
df = df.drop(['deck','embark_town','alive','class','who','alone','adult_male'],axis=1)
df = df.dropna(subset=['embarked','age'])

In [5]:
df.shape

(712, 8)

In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

df.iloc[:,2] = labelencoder.fit_transform(df.iloc[:,2])
df.iloc[:,7] = labelencoder.fit_transform(df.iloc[:,7])

In [7]:
X = df.iloc[:,1:8].values
Y = df.iloc[:, 0].values

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=0)

In [9]:
from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



def models(X_train,Y_train):
    log = LogisticRegression(random_state=0)
    log.fit(X_train,Y_train)

    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski',p=2)
    knn.fit(X_train,Y_train)

    svc_line = SVC(kernel='linear',random_state=0)
    svc_line.fit(X_train,Y_train)

    svc_rbf = SVC(kernel='rbf',random_state=0)
    svc_rbf.fit(X_train,Y_train)

    gauss = GaussianNB()
    gauss.fit(X_train,Y_train)

    tree = DecisionTreeClassifier(criterion = 'entropy',random_state=0)
    tree.fit(X_train,Y_train)

    forest = RandomForestClassifier(n_estimators=10, criterion = 'entropy',random_state=0)
    forest.fit(X_train,Y_train)

    print('[0]Logistic Regression Accuracy: ' ,log.score(X_train,Y_train)*100)
    print('[1]KNN Regression Accuracy: ' ,knn.score(X_train,Y_train)*100)
    print('[2]SVC_Line Regression Accuracy: ' ,svc_line.score(X_train,Y_train)*100)
    print('[3]SVC_RBf Regression Accuracy: ' ,svc_rbf.score(X_train,Y_train)*100)
    print('[4]Gauss Regression Accuracy: ' ,gauss.score(X_train,Y_train)*100)
    print('[5]Tree Regression Accuracy: ' ,tree.score(X_train,Y_train)*100)
    print('[6]Forest Regression Accuracy: ' ,forest.score(X_train,Y_train)*100)
    
    return log, knn, svc_line, svc_rbf, gauss, tree, forest


In [11]:
models = models(X_train,Y_train)

[0]Logistic Regression Accuracy:  79.78910369068541
[1]KNN Regression Accuracy:  86.64323374340948
[2]SVC_Line Regression Accuracy:  77.68014059753953
[3]SVC_RBf Regression Accuracy:  85.06151142355009
[4]Gauss Regression Accuracy:  80.31634446397187
[5]Tree Regression Accuracy:  99.29701230228471
[6]Forest Regression Accuracy:  97.53954305799648


In [131]:
from sklearn.metrics import confusion_matrix
# accuracy_score, confusion_matrix, precision_score, recall_score, f1_score,roc_auc_score, roc_curve


for i in range(len(models)):
    cm = confusion_matrix(Y_test,models[i].predict(X_test))
    
    TN, FP, FN, TP = confusion_matrix(Y_test,models[i].predict(X_test)).ravel()
    
    test_score = (TP+TN) / (TP + TN + FN + FP)
    print(cm)
    print(f'Model[{i}] Testing Accuracy = {test_score}')
    print()

[[75  7]
 [19 42]]
Model[0] Testing Accuracy = 0.8181818181818182

[[67 15]
 [20 41]]
Model[1] Testing Accuracy = 0.7552447552447552

[[70 12]
 [18 43]]
Model[2] Testing Accuracy = 0.7902097902097902

[[75  7]
 [22 39]]
Model[3] Testing Accuracy = 0.7972027972027972

[[69 13]
 [27 34]]
Model[4] Testing Accuracy = 0.7202797202797203

[[54 28]
 [12 49]]
Model[5] Testing Accuracy = 0.7202797202797203

[[73  9]
 [14 47]]
Model[6] Testing Accuracy = 0.8391608391608392



In [132]:
forest = models[6]
importances = pd.DataFrame({'feature':df.iloc[:,1:8].columns, 
                            'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).reset_index(drop=True)
importances

Unnamed: 0,feature,importance
0,age,0.3
1,fare,0.296
2,sex,0.183
3,pclass,0.098
4,sibsp,0.05
5,parch,0.044
6,embarked,0.03


In [133]:
pred = models[6].predict(X_test)
print(pred)
print()
print(Y_test)

[1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1
 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1 1
 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 1]

[0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 1
 1 1 1 1 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 1 1 1
 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0
 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1]


In [134]:
from sklearn.preprocessing import StandardScaler

my_sur = [[3,1,21,0,0,0,1]]
print(len(my_sur[0]))

my_sur = [[1,1,0,38.0,1,0,71.2833]]

sc=StandardScaler()
my_sur_scaled = sc.fit_transform(my_sur)

pred = models[6].predict(my_sur_scaled)
# print(pred)

if pred==0:
    print('fail')
else:
    print('pass')

7
fail


In [189]:
df=pd.read_csv('./data/fdb.csv')
df['news_sentiment_score']= df['news_sentiment_score'].fillna(0)
df['amount_of_articles']= df['amount_of_articles'].fillna(0)
df['headline']=df['headline'].fillna('[]')
df['headline']=df['headline'].apply(lambda x: ast.literal_eval(x))
df

Unnamed: 0,Ticker,Date,Open,Close,Volume,headline,news_sentiment_score,amount_of_articles,Same_day_move,from_previous_day
0,TSLA,2021-01-04,719.460022,729.770020,48638200,"[2 Tesla Analysts Break Down Fundamentals, Val...",1.71,55.0,1,0
1,TSLA,2021-01-05,723.659973,735.109985,32245200,[Tesla Gets Regulatory Nod To Begin Sales In I...,1.76,41.0,1,0
2,TSLA,2021-01-06,758.489990,755.979980,44700000,[Tesla Option Traders Are Dumping Massive Amou...,1.83,48.0,0,1
3,TSLA,2021-01-07,777.630005,816.039978,51498900,"[Elon Musk, Jeff Bezos Battling It Out For The...",2.04,45.0,1,1
4,TSLA,2021-01-08,856.000000,880.020020,75055500,"[Tesla To Launch $25,000 EV Sedan In China By ...",1.71,41.0,1,1
...,...,...,...,...,...,...,...,...,...,...
319,AMD,2021-04-23,80.209999,82.760002,49194000,"[What to Expect From AMD This Earnings Season,...",1.87,23.0,1,1
320,AMD,2021-04-26,83.349998,85.410004,57594500,[Chipmaker stocks are going to be ‘the key’ to...,1.95,22.0,1,1
321,AMD,2021-04-27,85.669998,85.209999,61909900,"[Dow Jones Futures: Microsoft, Google, AMD Lea...",1.93,59.0,0,1
322,AMD,2021-04-28,88.849998,84.019997,108920300,"[U.S. stocks end lower, despite Powell vow to ...",2.00,53.0,0,1


In [190]:
# df = df.drop(['Ticker','Date','headline'],axis=1)

In [191]:
df

Unnamed: 0,Ticker,Date,Open,Close,Volume,headline,news_sentiment_score,amount_of_articles,Same_day_move,from_previous_day
0,TSLA,2021-01-04,719.460022,729.770020,48638200,"[2 Tesla Analysts Break Down Fundamentals, Val...",1.71,55.0,1,0
1,TSLA,2021-01-05,723.659973,735.109985,32245200,[Tesla Gets Regulatory Nod To Begin Sales In I...,1.76,41.0,1,0
2,TSLA,2021-01-06,758.489990,755.979980,44700000,[Tesla Option Traders Are Dumping Massive Amou...,1.83,48.0,0,1
3,TSLA,2021-01-07,777.630005,816.039978,51498900,"[Elon Musk, Jeff Bezos Battling It Out For The...",2.04,45.0,1,1
4,TSLA,2021-01-08,856.000000,880.020020,75055500,"[Tesla To Launch $25,000 EV Sedan In China By ...",1.71,41.0,1,1
...,...,...,...,...,...,...,...,...,...,...
319,AMD,2021-04-23,80.209999,82.760002,49194000,"[What to Expect From AMD This Earnings Season,...",1.87,23.0,1,1
320,AMD,2021-04-26,83.349998,85.410004,57594500,[Chipmaker stocks are going to be ‘the key’ to...,1.95,22.0,1,1
321,AMD,2021-04-27,85.669998,85.209999,61909900,"[Dow Jones Futures: Microsoft, Google, AMD Lea...",1.93,59.0,0,1
322,AMD,2021-04-28,88.849998,84.019997,108920300,"[U.S. stocks end lower, despite Powell vow to ...",2.00,53.0,0,1


In [192]:
df.shape

(324, 10)

In [194]:
df = df.drop(['Date','headline'],axis=1)
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

df.iloc[:,0] = labelencoder.fit_transform(df.iloc[:,0])


In [195]:
df

Unnamed: 0,Ticker,Open,Close,Volume,news_sentiment_score,amount_of_articles,Same_day_move,from_previous_day
0,3,719.460022,729.770020,48638200,1.71,55.0,1,0
1,3,723.659973,735.109985,32245200,1.76,41.0,1,0
2,3,758.489990,755.979980,44700000,1.83,48.0,0,1
3,3,777.630005,816.039978,51498900,2.04,45.0,1,1
4,3,856.000000,880.020020,75055500,1.71,41.0,1,1
...,...,...,...,...,...,...,...,...
319,0,80.209999,82.760002,49194000,1.87,23.0,1,1
320,0,83.349998,85.410004,57594500,1.95,22.0,1,1
321,0,85.669998,85.209999,61909900,1.93,59.0,0,1
322,0,88.849998,84.019997,108920300,2.00,53.0,0,1


In [196]:
X = df.iloc[:,:7].values
Y = df.iloc[:, 7].values

In [197]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=0,stratify=df['Ticker'])

In [198]:
sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [199]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



def models(X_train,Y_train):
    log = LogisticRegression(random_state=0)
    log.fit(X_train,Y_train)

    knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski',p=2)
    knn.fit(X_train,Y_train)

    svc_line = SVC(kernel='linear',random_state=0)
    svc_line.fit(X_train,Y_train)

    svc_rbf = SVC(kernel='rbf',random_state=0)
    svc_rbf.fit(X_train,Y_train)

    gauss = GaussianNB()
    gauss.fit(X_train,Y_train)

    tree = DecisionTreeClassifier(criterion = 'entropy',random_state=0)
    tree.fit(X_train,Y_train)

    forest = RandomForestClassifier(n_estimators=10, criterion = 'entropy',random_state=0)
    forest.fit(X_train,Y_train)

    print('[0]Logistic Regression Accuracy: ' ,log.score(X_train,Y_train)*100)
    print('[1]KNN Regression Accuracy: ' ,knn.score(X_train,Y_train)*100)
    print('[2]SVC_Line Regression Accuracy: ' ,svc_line.score(X_train,Y_train)*100)
    print('[3]SVC_RBf Regression Accuracy: ' ,svc_rbf.score(X_train,Y_train)*100)
    print('[4]Gauss Regression Accuracy: ' ,gauss.score(X_train,Y_train)*100)
    print('[5]Tree Regression Accuracy: ' ,tree.score(X_train,Y_train)*100)
    print('[6]Forest Regression Accuracy: ' ,forest.score(X_train,Y_train)*100)
    
    return log, knn, svc_line, svc_rbf, gauss, tree, forest


In [200]:
models = models(X_train,Y_train)

[0]Logistic Regression Accuracy:  59.84555984555985
[1]KNN Regression Accuracy:  70.65637065637065
[2]SVC_Line Regression Accuracy:  59.07335907335908
[3]SVC_RBf Regression Accuracy:  61.38996138996139
[4]Gauss Regression Accuracy:  59.07335907335908
[5]Tree Regression Accuracy:  100.0
[6]Forest Regression Accuracy:  98.84169884169884


In [201]:
for i in range(len(models)):
    cm = confusion_matrix(Y_test,models[i].predict(X_test))
    
    TN, FP, FN, TP = confusion_matrix(Y_test,models[i].predict(X_test)).ravel()
    
    test_score = (TP+TN) / (TP + TN + FN + FP)
    print(cm)
    print(f'Model[{i}] Testing Accuracy = {test_score*100}')
    print()

[[ 2 25]
 [ 9 29]]
Model[0] Testing Accuracy = 47.69230769230769

[[ 8 19]
 [16 22]]
Model[1] Testing Accuracy = 46.15384615384615

[[ 0 27]
 [ 0 38]]
Model[2] Testing Accuracy = 58.46153846153847

[[ 1 26]
 [ 0 38]]
Model[3] Testing Accuracy = 60.0

[[ 6 21]
 [ 7 31]]
Model[4] Testing Accuracy = 56.92307692307692

[[11 16]
 [15 23]]
Model[5] Testing Accuracy = 52.307692307692314

[[12 15]
 [15 23]]
Model[6] Testing Accuracy = 53.84615384615385



In [202]:
pred = models[3].predict(X_test)
print(pred)
print()
print(Y_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1]

[1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1
 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1]


In [205]:
# 1
forest = models[3]
importances = pd.DataFrame({'feature':df.iloc[:,:7].columns, 
                            'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).reset_index(drop=True)
importances

AttributeError: 'SVC' object has no attribute 'feature_importances_'