# Sınıflandırma Problemleri

In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"veri_transform.csv")

In [3]:
# Split the loaded frame into predictors (everything except "fiyat") and target.
# NOTE(review): "fiyat" looks like a numeric column -- the classification report
# printed further down lists labels such as 75000.0 and 80000.0 with a support of
# 0-3 rows each, and the executed Naive Bayes / KNN cells score only about
# 0.2% / 1.2% accuracy. Presumably those cells were run against this target
# rather than the diabetes "Outcome" label used by the rest of the notebook --
# confirm whether this cell should be removed or the target binned first.
X = df.drop(["fiyat"], axis = 1)
y = df["fiyat"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Lojistik Regresyon 

## Model 

In [None]:
# Read the diabetes data set, keep the untouched frame in `diabetes`, and work
# on a copy with rows containing missing values removed; the last expression
# previews the first rows.
diabetes = pd.read_csv("diabetes.csv")
df = diabetes.copy().dropna()
df.head()

In [None]:
df.info()

In [None]:
df["Outcome"].value_counts()

In [None]:
df["Outcome"].value_counts().plot.barh();

In [None]:
df.describe().T

In [None]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [None]:
#statsmodels

In [None]:
# Fit a statsmodels logit model of y on X and display its summary table.
# NOTE(review): X is passed without a constant column (there is no
# sm.add_constant call), so presumably this model has no intercept, unlike the
# scikit-learn fit below that exposes intercept_ -- confirm that this is intended.
loj = sm.Logit(y, X)
loj_model= loj.fit()
loj_model.summary()

In [None]:
#scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X,y)
loj_model

In [None]:
loj_model.intercept_

In [None]:
loj_model.coef_

## Tahmin & Model Tuning

In [None]:
y_pred = loj_model.predict(X)

In [None]:
confusion_matrix(y, y_pred)

In [None]:
accuracy_score(y, y_pred)

In [None]:
print(classification_report(y, y_pred))

In [None]:
loj_model.predict(X)[0:10]

In [None]:
loj_model.predict_proba(X)[0:10][:,0:2]

In [None]:
y[0:10]

In [None]:
y_probs = loj_model.predict_proba(X)
y_probs = y_probs[:,1]

In [None]:
y_probs[0:10]

In [None]:
# Turn positive-class probabilities into 0/1 labels with a 0.5 cut-off
# (strictly greater than 0.5 maps to 1).
y_pred = [int(prob > 0.5) for prob in y_probs]

In [None]:
y_pred[0:10]

In [None]:
confusion_matrix(y, y_pred)

In [None]:
accuracy_score(y, y_pred)

In [None]:
print(classification_report(y, y_pred))

In [None]:
loj_model.predict_proba(X)[:,1][0:5]

In [None]:
# ROC curve and AUC for the logistic-regression model on the full data set.
# Both are derived from the positive-class probabilities. Passing the hard 0/1
# labels from predict() to roc_auc_score scores a single operating point, so
# the legend value would not be the area under the curve that is plotted.
y_scores = loj_model.predict_proba(X)[:,1]
logit_roc_auc = roc_auc_score(y, y_scores)

fpr, tpr, thresholds = roc_curve(y, y_scores)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Oranı')
plt.ylabel('True Positive Oranı')
plt.title('ROC')
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.30, 
                                                    random_state = 42)


In [None]:
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)
loj_model

In [None]:
accuracy_score(y_test, loj_model.predict(X_test))

In [None]:
# Mean 10-fold cross-validated score of the logistic-regression estimator.
# NOTE(review): the folds are drawn from the hold-out split (X_test / y_test),
# so each fold presumably refits a fresh copy of the estimator on test rows
# instead of scoring the model fitted on X_train above -- confirm whether
# X_train / y_train was intended here.
cross_val_score(loj_model, X_test, y_test, cv = 10).mean()

# Gaussian Naive Bayes

In [5]:
from sklearn.naive_bayes import GaussianNB

In [6]:
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)
nb_model

GaussianNB()

In [None]:
nb_model.predict(X_test)[0:10]

In [None]:
nb_model.predict_proba(X_test)[0:10]

In [7]:
y_pred = nb_model.predict(X_test)

In [8]:
accuracy_score(y_test, y_pred)

0.0016625103906899418

In [9]:
cross_val_score(nb_model, X_test, y_test, cv = 10).mean()

0.005

In [10]:
nb_model.score(X_test,y_test)

0.0016625103906899418

# KNN

## Model & Tahmin

In [11]:
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)
knn_model

KNeighborsClassifier()

In [12]:
y_pred = knn_model.predict(X_test)

In [13]:
accuracy_score(y_test, y_pred)

0.011637572734829594

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     75000.0       0.00      0.00      0.00         0
     77000.0       0.00      0.00      0.00         0
     79000.0       0.00      0.00      0.00         0
     80000.0       0.00      0.00      0.00         1
     99000.0       0.00      0.00      0.00         1
    109000.0       0.00      0.00      0.00         1
    110000.0       0.00      0.00      0.00         0
    113000.0       0.00      0.00      0.00         2
    115000.0       0.00      0.00      0.00         2
    115500.0       0.00      0.00      0.00         0
    117000.0       0.00      0.00      0.00         0
    120000.0       0.00      0.00      0.00         3
    125000.0       0.00      0.00      0.00         0
    128000.0       0.00      0.00      0.00         0
    130000.0       0.00      0.00      0.00         2
    132000.0       0.00      0.00      0.00         1
    135000.0       0.00      0.00      0.00         3
    136000.0       0.00    

In [15]:
knn_model.score(X_test,y_test)

0.011637572734829594

## Model Tuning

In [None]:
# Search space for KNN: every neighbour count from 1 through 49.
knn_params = dict(n_neighbors=np.arange(1, 50))

In [None]:
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_params, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
print("En iyi skor:" + str(knn_cv.best_score_))
print("En iyi parametreler: " + str(knn_cv.best_params_))

In [None]:
knn = KNeighborsClassifier(11)
knn_tuned = knn.fit(X_train, y_train)

In [None]:
knn_tuned.score(X_test, y_test)

In [None]:
y_pred = knn_tuned.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

# SVC

## Model & Tahmin

In [None]:
svm_model = SVC(kernel = "linear").fit(X_train, y_train)

In [None]:
svm_model

In [None]:
y_pred = svm_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
svc_params = {"C": np.arange(1,10)}

svc = SVC(kernel = "linear")

svc_cv_model = GridSearchCV(svc,svc_params, 
                            cv = 10, 
                            n_jobs = -1, 
                            verbose = 2 )

svc_cv_model.fit(X_train, y_train)

In [None]:
print("En iyi parametreler: " + str(svc_cv_model.best_params_))

In [None]:
svc_tuned = SVC(kernel = "linear", C = 5).fit(X_train, y_train)

In [None]:
y_pred = svc_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# RBF SVC

## Model & Tahmin

In [None]:
svc_model = SVC(kernel = "rbf").fit(X_train, y_train)

In [None]:
svc_model

In [None]:
y_pred = svc_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
# Search space for the RBF SVC: C and gamma are scanned over the same list of
# magnitudes.
svc_params = dict(
    C=[0.0001, 0.001, 0.1, 1, 5, 10, 50, 100],
    gamma=[0.0001, 0.001, 0.1, 1, 5, 10, 50, 100],
)

In [None]:
svc = SVC()
svc_cv_model = GridSearchCV(svc, svc_params, 
                         cv = 10, 
                         n_jobs = -1,
                         verbose = 2)

svc_cv_model.fit(X_train, y_train)

In [None]:
print("En iyi parametreler: " + str(svc_cv_model.best_params_))

In [None]:
svc_tuned = SVC(C = 10, gamma = 0.0001).fit(X_train, y_train)

In [None]:
y_pred = svc_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# Yapay Sinir Ağları

## Model & Tahmin

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler  

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
X_test_scaled[0:5]

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlpc = MLPClassifier().fit(X_train_scaled, y_train)

In [None]:
y_pred = mlpc.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
mlpc

In [None]:
?mlpc

In [None]:
# Search space for MLPClassifier: regularisation strength, layer layouts,
# optimiser and activation function.
mlpc_params = dict(
    alpha=[0.1, 0.01, 0.02, 0.005, 0.0001, 0.00001],
    hidden_layer_sizes=[
        (10, 10, 10),
        (100, 100, 100),
        (100, 100),
        (3, 5),
        (5, 3),
    ],
    solver=["lbfgs", "adam", "sgd"],
    activation=["relu", "logistic"],
)


In [None]:
mlpc = MLPClassifier()
mlpc_cv_model = GridSearchCV(mlpc, mlpc_params, 
                         cv = 10, 
                         n_jobs = -1,
                         verbose = 2)

mlpc_cv_model.fit(X_train_scaled, y_train)

In [None]:
print("En iyi parametreler: " + str(mlpc_cv_model.best_params_))

In [None]:
mlpc_tuned = MLPClassifier(activation = "logistic", 
                           alpha = 0.1, 
                           hidden_layer_sizes = (100, 100, 100),
                          solver = "adam")

In [None]:
mlpc_tuned.fit(X_train_scaled, y_train)

In [None]:
y_pred = mlpc_tuned.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

# CART

## Model & Tahmin

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
# Single-feature alternative kept for reference: X = df["Pregnancies"]
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
cart = DecisionTreeClassifier()
cart_model = cart.fit(X_train, y_train)

In [None]:
cart_model

In [None]:
#!pip install skompiler
from skompiler import skompile
print(skompile(cart_model.predict).to("python/code"))

In [None]:
x = [9]

In [None]:
# Hand-written decision rule evaluated on the one-element list x defined above.
# NOTE(review): it only reads x[0]; presumably it was pasted from the skompile
# output of a tree fitted on the single "Pregnancies" feature (the
# X = df["Pregnancies"] variant mentioned in the CART data cell), so it will not
# match the tree fitted on all features -- verify or remove.
((0 if x[0] <= 2.5 else 0) if x[0] <= 6.5 else 1 if x[0] <= 13.5 else 1)

In [None]:
y_pred = cart_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
cart_model

In [None]:
?cart_model

In [None]:
# Search space for the decision tree: depths 1-9 and split thresholds 2-49.
cart_grid = dict(
    max_depth=range(1, 10),
    min_samples_split=list(range(2, 50)),
)

In [None]:
cart = tree.DecisionTreeClassifier()
cart_cv = GridSearchCV(cart, cart_grid, cv = 10, n_jobs = -1, verbose = 2)
cart_cv_model = cart_cv.fit(X_train, y_train)

In [None]:
print("En iyi parametreler: " + str(cart_cv_model.best_params_))

In [None]:
#final

In [None]:
cart = tree.DecisionTreeClassifier(max_depth = 5, min_samples_split = 19)
cart_tuned = cart.fit(X_train, y_train)

In [None]:
y_pred = cart_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# Random Forests

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
rf_model

In [None]:
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
rf_model

In [None]:
?rf_model

In [None]:
# Search space for the random forest: tree depth, features tried per split,
# forest size and minimum samples needed to split a node.
rf_params = dict(
    max_depth=[2, 5, 8, 10],
    max_features=[2, 5, 8],
    n_estimators=[10, 500, 1000],
    min_samples_split=[2, 5, 10],
)

In [None]:
rf_model = RandomForestClassifier()

rf_cv_model = GridSearchCV(rf_model, 
                           rf_params, 
                           cv = 10, 
                           n_jobs = -1, 
                           verbose = 2) 

In [None]:
rf_cv_model.fit(X_train, y_train)

In [None]:
print("En iyi parametreler: " + str(rf_cv_model.best_params_))

In [None]:
#final

In [None]:
rf_tuned = RandomForestClassifier(max_depth = 10, 
                                  max_features = 8, 
                                  min_samples_split = 10,
                                  n_estimators = 1000)

rf_tuned.fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
Importance = pd.DataFrame({"Importance": rf_tuned.feature_importances_*100},
                         index = X_train.columns)

In [None]:
Importance.sort_values(by = "Importance", 
                       axis = 0, 
                       ascending = True).plot(kind ="barh", color = "r")

plt.xlabel("Değişken Önem Düzeyleri")

# Gradient Boosting Machines

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
y_pred = gbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
gbm_model

In [None]:
?gbm_model

In [None]:
# Search space for GradientBoostingClassifier.
# n_estimators used to list 100 twice ([100, 500, 100]), which made the grid
# search refit an identical configuration for nothing; the third entry is now
# 1000, matching the upper value used by the random-forest grid.
gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.05],
             "n_estimators": [100,500,1000],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}

In [None]:
gbm = GradientBoostingClassifier()

gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
gbm_cv.fit(X_train, y_train)

In [None]:
print("En iyi parametreler: " + str(gbm_cv.best_params_))

In [None]:
gbm = GradientBoostingClassifier(learning_rate = 0.01, 
                                 max_depth = 3,
                                min_samples_split = 5,
                                n_estimators = 500)

In [None]:
gbm_tuned =  gbm.fit(X_train,y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# XGBoost

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
#!pip install xgboost
from xgboost import XGBClassifier

In [None]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
xgb_model

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
xgb_model

In [None]:
?xgb_model

In [None]:
# Search space for XGBClassifier.
# "min_samples_split" was removed: it is a scikit-learn tree option, not an
# XGBoost parameter, so its three values tripled the number of fits without
# changing any model. XGBoost's closest analogue is min_child_weight, should a
# leaf-size constraint be wanted in the search.
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05]}

In [None]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
xgb_cv_model.fit(X_train, y_train)

In [None]:
xgb_cv_model.best_params_

In [None]:
# Final XGBoost estimator with the hyper-parameters picked after the search.
# The min_samples_split keyword was dropped: it is a scikit-learn tree option
# that XGBoost does not use, so passing it had no effect on the fitted model.
xgb = XGBClassifier(learning_rate = 0.01, 
                    max_depth = 6,
                    n_estimators = 100,
                    subsample = 0.8)

In [None]:
xgb_tuned =  xgb.fit(X_train,y_train)

In [None]:
y_pred = xgb_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# LightGBM

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
#!conda install -c conda-forge lightgbm
from lightgbm import LGBMClassifier

In [None]:
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [None]:
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
## Model Tuning

In [None]:
lgbm_model

In [None]:
?lgbm_model

In [None]:
# Search space for LGBMClassifier.
# NOTE(review): LightGBM presumably applies `subsample` (bagging fraction) only
# when `subsample_freq` is greater than 0, and no cell here sets it -- if so,
# the three subsample values produce identical models and only triple the
# search time. Confirm against the LightGBM parameter documentation.
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [None]:
lgbm = LGBMClassifier()

lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose = 2)



In [None]:
lgbm_cv_model.fit(X_train, y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
lgbm = LGBMClassifier(learning_rate = 0.01, 
                       max_depth = 3,
                       subsample = 0.6,
                       n_estimators = 500,
                       min_child_samples = 20)

In [None]:
lgbm_tuned = lgbm.fit(X_train,y_train)

In [None]:
y_pred = lgbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# CatBoost

## Model & Tahmin

In [None]:
# Rebuild predictors / target from the raw diabetes frame and redo the 70/30
# hold-out split with the fixed seed 42.
df = diabetes.copy().dropna()
y = df["Outcome"]
X = pd.DataFrame(df.drop(columns=["Outcome"]))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
#!pip install catboost
from catboost import CatBoostClassifier

In [None]:
cat_model = CatBoostClassifier().fit(X_train, y_train)

In [None]:
y_pred = cat_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Model Tuning

In [None]:
# Search space for CatBoostClassifier: boosting rounds, step size, tree depth.
catb_params = dict(
    iterations=[200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[3, 5, 8],
)

In [None]:
catb = CatBoostClassifier()
catb_cv_model = GridSearchCV(catb, catb_params, cv=5, n_jobs = -1, verbose = 2)
catb_cv_model.fit(X_train, y_train)
catb_cv_model.best_params_

In [None]:
catb_cv_model.best_params_

In [None]:
catb = CatBoostClassifier(iterations = 200, 
                          learning_rate = 0.05, 
                          depth = 5)

catb_tuned = catb.fit(X_train, y_train)
y_pred = catb_tuned.predict(X_test)

In [None]:
y_pred = catb_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

# Tüm Modellerin Karşılaştırılması

In [None]:
# Fitted models to compare on the hold-out split.
modeller = [
    knn_tuned,
    loj_model,
    svc_tuned,
    nb_model,
    mlpc_tuned,
    cart_tuned,
    rf_tuned,
    gbm_tuned,
    catb_tuned,
    lgbm_tuned,
    xgb_tuned
    
]


# Print the hold-out accuracy of every model, labelled by its class name.
for model in modeller:
    isimler = model.__class__.__name__
    # mlpc_tuned was fitted on StandardScaler-transformed features, so it must be
    # scored on X_test_scaled; scoring it on raw X_test understates its accuracy.
    # X_test_scaled comes from a split with the same test_size and random_state
    # as the current one, so its rows line up with y_test.
    X_eval = X_test_scaled if model is mlpc_tuned else X_test
    y_pred = model.predict(X_eval)
    dogruluk = accuracy_score(y_test, y_pred)
    print("-"*28)
    print(isimler + ":" )
    print("Accuracy: {:.4%}".format(dogruluk))

In [None]:
# Collect one {model name, accuracy %} record per model and build the results
# frame once. DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row inside a loop is quadratic anyway.
sonuc = []

for model in modeller:
    isimler = model.__class__.__name__
    # mlpc_tuned was fitted on StandardScaler-transformed features, so it must be
    # scored on X_test_scaled (same split parameters, hence same rows as y_test).
    X_eval = X_test_scaled if model is mlpc_tuned else X_test
    y_pred = model.predict(X_eval)
    dogruluk = accuracy_score(y_test, y_pred)
    sonuc.append({"Modeller": isimler, "Accuracy": dogruluk*100})

sonuclar = pd.DataFrame(sonuc, columns= ["Modeller","Accuracy"])

# Horizontal bar chart of accuracy (in percent) per model.
sns.barplot(x= 'Accuracy', y = 'Modeller', data=sonuclar, color="r")
plt.xlabel('Accuracy %')
plt.title('Modellerin Doğruluk Oranları');