In [9]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [10]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [11]:
model_data = pd.read_csv("cleaned_data.csv")[:30001]
data = model_data.drop(['Target','patientId'], axis =1)
target = model_data['Target']

In [12]:
model_data

Unnamed: 0,patientId,Target,PatientAge,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,51.0,1,0,0,1
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,48.0,1,0,0,1
2,00322d4d-1c29-4943-afc9-b6754be640eb,0,19.0,0,1,1,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,28.0,0,1,0,1
4,00436515-870c-4b36-a041-de91049b9ab4,1,32.0,1,0,1,0
...,...,...,...,...,...,...,...
29996,2c917d3a-95cb-4c11-802c-f83e28cb37bc,1,33.0,0,1,1,0
29997,2c917d3a-95cb-4c11-802c-f83e28cb37bc,1,33.0,0,1,1,0
29998,2c96c09b-aaa8-4c07-8e69-c2210f04be2d,0,63.0,1,0,1,0
29999,2c9a388f-0042-4b88-b52b-ea0b21fb7960,1,50.0,0,1,1,0


In [13]:
data

Unnamed: 0,PatientAge,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,51.0,1,0,0,1
1,48.0,1,0,0,1
2,19.0,0,1,1,0
3,28.0,0,1,0,1
4,32.0,1,0,1,0
...,...,...,...,...,...
29996,33.0,0,1,1,0
29997,33.0,0,1,1,0
29998,63.0,1,0,1,0
29999,50.0,0,1,1,0


In [14]:
target

0        0
1        0
2        0
3        0
4        1
        ..
29996    1
29997    1
29998    0
29999    1
30000    0
Name: Target, Length: 30001, dtype: int64

In [15]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []


for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(data, target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.711676,"{'C': 20, 'kernel': 'rbf'}"
1,random_forest,0.713876,{'n_estimators': 10}
2,logistic_regression,0.698777,{'C': 1}
3,naive_bayes_gaussian,0.69271,{}
4,naive_bayes_multinomial,0.690777,{}
5,decision_tree,0.71451,{'criterion': 'gini'}


In [16]:
import pickle

In [16]:
with open('svm_pickel', 'wb') as f:
    pickle.dump(model_data, f)

In [17]:
with open('svm_pickel', 'rb') as f:
    mp = pickle.load(f)

In [29]:
model_data.head(10)

Unnamed: 0,patientId,Target,PatientAge,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,51.0,1,0,0,1
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,48.0,1,0,0,1
2,00322d4d-1c29-4943-afc9-b6754be640eb,0,19.0,0,1,1,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,28.0,0,1,0,1
4,00436515-870c-4b36-a041-de91049b9ab4,1,32.0,1,0,1,0
5,00436515-870c-4b36-a041-de91049b9ab4,1,32.0,1,0,1,0
6,00569f44-917d-4c86-a842-81832af98c30,0,54.0,0,1,1,0
7,006cec2e-6ce2-4549-bffa-eadfcd1e9970,0,78.0,0,1,0,1
8,00704310-78a8-4b38-8475-49f4573b2dbb,1,75.0,0,1,0,1
9,00704310-78a8-4b38-8475-49f4573b2dbb,1,75.0,0,1,0,1


In [17]:
svm_model=svm.SVC(gamma='auto',C=20,kernel='rbf',probability=True, random_state=0)
svm_model.fit(data, target)
svm_model.score(data,target)

0.7248091730275658

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(data,target,test_size=0.3)

In [19]:
print(X_test)

       PatientAge  PatientSex_F  PatientSex_M  ViewPosition_AP  \
16534        48.0             1             0                1   
3710         54.0             0             1                0   
25646        60.0             0             1                1   
7397         16.0             0             1                0   
24595        45.0             0             1                0   
...           ...           ...           ...              ...   
11124        61.0             1             0                0   
17844        46.0             0             1                1   
14275        55.0             0             1                0   
17759        60.0             0             1                0   
19440        44.0             1             0                1   

       ViewPosition_PA  
16534                0  
3710                 1  
25646                0  
7397                 1  
24595                1  
...                ...  
11124                1  
17844  

In [20]:
svm_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
svm_model.score(X_train, Y_train)

0.7238571428571429

In [22]:
svm_model.score(X_test, Y_test)

0.7270303299633374

In [23]:
import pickle

In [24]:
with open('svm_pickle_probability_30k', 'wb') as f:
    pickle.dump(svm_model, f)

In [25]:
with open('svm_pickle_probability_30k', 'rb') as f:
    mp = pickle.load(f)

In [26]:
svm_model.predict_proba(X_test)

array([[0.75156184, 0.24843816],
       [0.77853173, 0.22146827],
       [0.77857351, 0.22142649],
       ...,
       [0.7786026 , 0.2213974 ],
       [0.77849724, 0.22150276],
       [0.75667954, 0.24332046]])

In [27]:
mp.predict_proba(X_test)

array([[0.75156184, 0.24843816],
       [0.77853173, 0.22146827],
       [0.77857351, 0.22142649],
       ...,
       [0.7786026 , 0.2213974 ],
       [0.77849724, 0.22150276],
       [0.75667954, 0.24332046]])

In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
patient_details=[[50,1,0,0,1]]

In [34]:
mp.predict_proba(patient_details)

array([[0.77853947, 0.22146053]])

In [2]:
# pip install --user scikit-learn --upgrade

In [28]:
import sklearn
print(sklearn.__version__)

1.1.2


In [9]:
# model_data2 = pd.read_csv("Cleaned_Data_Final.csv")[:30000]
# #model_data2.drop(model_data2[0], axis=1, inplace=True)
# # model_data2 = model_data2.drop(model_data2.iloc[:, 0], axis=1, inplace=True)
# # #model_data2.iloc[:, 0]
# model_data2

In [10]:
# data2 = model_data2.drop(['Target','patientId'], axis =1)
# target2 = model_data2['Target']

In [11]:
# model_data2

In [12]:
# data2

In [13]:
#target2.head(30)

In [14]:
# model_params2 = {
#     'svm': {
#         'model': svm.SVC(gamma='auto'),
#         'params' : {
#             'C': [1,10,20],
#             'kernel': ['rbf','linear']
#         }  
#     },
#     'random_forest': {
#         'model': RandomForestClassifier(),
#         'params' : {
#             'n_estimators': [1,5,10],
         
#         }
#     },
#     'logistic_regression' : {
#         'model': LogisticRegression(solver='liblinear',multi_class='auto'),
#         'params': {
#             'C': [1,5,10]
#         }
#     },
#     'naive_bayes_gaussian': {
#         'model': GaussianNB(),
#         'params': {}
#     },
#     'naive_bayes_multinomial': {
#         'model': MultinomialNB(),
#         'params': {}
#     },
#     'decision_tree': {
#         'model': DecisionTreeClassifier(),
#         'params': {
#             'criterion': ['gini','entropy'],
       
#         }
#     }     
# }

In [15]:
# from sklearn.model_selection import GridSearchCV
# import pandas as pd
# scores = []


# for model_name, mp in model_params2.items():
#     clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
#     clf.fit(data2, target2)
#     scores.append({
#         'model': model_name,
#         'best_score': clf.best_score_,
#         'best_params': clf.best_params_
#     })
    
# df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
# df