In [123]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Veriyi okuma, yapısını inceleme, değerleri düzenleme

In [None]:
# Load the weather dataset from a relative CSV path and preview the first rows.
csv_path = "weather_classification_data.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Descriptive statistics, one row per numeric column.
df.describe().transpose()

In [None]:
# Count the rows that exceed each threshold, one line per column.
thresholds = {'Humidity': 100, 'Temperature': 60, 'Precipitation (%)': 100}
for column, limit in thresholds.items():
    print(f"{column} > {limit}:", df[column].gt(limit).sum())

In [None]:
# Print the frequency of every distinct value in each object-dtype column.
text_columns = df.select_dtypes(include='object')
for name, values in text_columns.items():
    print(f"\n{name} değer dağılımı:")
    print(values.value_counts())

In [58]:
# Cap each listed column at its own ceiling; limits pair with columns by position.
capped_columns = ['Humidity', 'Precipitation (%)', 'Temperature']
ceilings = [100, 100, 60]
df[capped_columns] = df[capped_columns].clip(upper=ceilings)

In [None]:
# Re-check the descriptive statistics, one row per numeric column.
df.describe().transpose()

# Encoding işlemi

In [63]:
# One-hot encode the categorical predictors, keeping every level (no column dropped).
categorical_columns = ['Cloud Cover', 'Season', 'Location']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

In [64]:
# Convert boolean columns to integers; every other column keeps its dtype.
bool_columns = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({name: int for name in bool_columns})

In [None]:
# Inspect column dtypes and non-null counts after encoding.
df_encoded.info()

In [None]:
# Display the encoded frame as the cell output.
df_encoded

# Model eğitimi için hazırlık ve model eğitimleri

In [76]:
# Separate the label column from the predictor columns.
target_column = 'Weather Type'
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])

In [77]:
# Map class names to integer codes; `le.classes_` is used later to label the reports.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [115]:
# Hold out 30% of the rows for testing. The split is stratified on the label to
# keep the class balance, and the fixed seed makes it repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Decision Tree

In [107]:
# Fit a decision tree (fixed seed) on the training split and predict the test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [108]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt, target_names=le.classes_)
print("Decision Tree Accuracy:", dt_accuracy)
print(dt_report)

Decision Tree Accuracy: 0.906060606060606
              precision    recall  f1-score   support

      Cloudy       0.88      0.90      0.89       990
       Rainy       0.89      0.90      0.89       990
       Snowy       0.95      0.91      0.93       990
       Sunny       0.91      0.91      0.91       990

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960



# Random Forest

In [109]:
# Fit a random forest (fixed seed) on the training split and predict the test split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [110]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, target_names=le.classes_)
print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)

Random Forest Accuracy: 0.9128787878787878
              precision    recall  f1-score   support

      Cloudy       0.86      0.92      0.89       990
       Rainy       0.90      0.91      0.91       990
       Snowy       0.94      0.91      0.93       990
       Sunny       0.95      0.91      0.93       990

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960



# Logistic Regression

In [111]:
# Standardize every feature column.
# NOTE: the scaling statistics are computed over all rows of X, with no
# train/test separation at this point.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [112]:
# Split the raw features first, then fit the scaler on the training rows only,
# so that test-set statistics never influence the scaling (avoids data leakage).
# Same 30% stratified split with the same seed as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

In [113]:
# Fit logistic regression (up to 1500 solver iterations) and predict the test split.
lr_model = LogisticRegression(max_iter=1500).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [114]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, target_names=le.classes_)
print("Logistic Regression Accuracy:", lr_accuracy)
print(lr_report)

Logistic Regression Accuracy: 0.8689393939393939
              precision    recall  f1-score   support

      Cloudy       0.82      0.83      0.83       990
       Rainy       0.85      0.85      0.85       990
       Snowy       0.89      0.92      0.90       990
       Sunny       0.92      0.87      0.89       990

    accuracy                           0.87      3960
   macro avg       0.87      0.87      0.87      3960
weighted avg       0.87      0.87      0.87      3960



# Naive Bayes

In [146]:
# Standardize every feature column (the same step as in the Logistic Regression section).
# NOTE: the scaling statistics are computed over all rows of X, with no
# train/test separation at this point.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [147]:
# Split the raw features first, then fit the scaler on the training rows only,
# so that test-set statistics never influence the scaling (avoids data leakage).
# Same 30% stratified split with the same seed as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

In [148]:
# Fit Gaussian naive Bayes on the training split and predict the test split.
nb_model = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [149]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb, target_names=le.classes_)
print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)

Naive Bayes Accuracy: 0.7800505050505051
              precision    recall  f1-score   support

      Cloudy       0.64      0.80      0.71       990
       Rainy       0.79      0.76      0.78       990
       Snowy       0.80      0.91      0.85       990
       Sunny       1.00      0.65      0.79       990

    accuracy                           0.78      3960
   macro avg       0.81      0.78      0.78      3960
weighted avg       0.81      0.78      0.78      3960



# Gradient Boosted Tree

In [118]:
# Fit gradient boosting (fixed seed, otherwise default settings) and predict the test split.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

In [120]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb, target_names=le.classes_)
print("Gradient Boosted Tree Accuracy:", gb_accuracy)
print(gb_report)

Gradient Boosted Tree Accuracy: 0.9148989898989899
              precision    recall  f1-score   support

      Cloudy       0.89      0.91      0.90       990
       Rainy       0.90      0.92      0.91       990
       Snowy       0.95      0.92      0.93       990
       Sunny       0.93      0.91      0.92       990

    accuracy                           0.91      3960
   macro avg       0.92      0.91      0.92      3960
weighted avg       0.92      0.91      0.92      3960



# XGBoost

In [121]:
# Fit XGBoost (multi-class log-loss as the eval metric, fixed seed) and predict the test split.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [122]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb, target_names=le.classes_)
print("XGBoost Accuracy:", xgb_accuracy)
print(xgb_report)

XGBoost Accuracy: 0.9141414141414141
              precision    recall  f1-score   support

      Cloudy       0.88      0.91      0.90       990
       Rainy       0.91      0.91      0.91       990
       Snowy       0.95      0.91      0.93       990
       Sunny       0.92      0.92      0.92       990

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960



# XGBoost hiperparametre optimizasyonu

In [124]:
# Candidate values for the XGBoost grid search (2 * 3 * 3 * 2 * 2 = 72 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

In [127]:
# Base estimator handed to the grid search; the searched parameters are set per candidate.
xgb_clf = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
)

In [128]:
# Exhaustive search over `param_grid`, scored by accuracy with 3-fold
# cross-validation, using all available cores.
grid_search_xgb = GridSearchCV(
    xgb_clf,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [129]:
# Run the search on the training split only; the test split is not used here.
grid_search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'colsample_bytree': [0.8, 1.0], 'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 5, ...], 'n_estimators': [100, 200], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [139]:
# Report the winning parameter combination and its mean cross-validated accuracy.
xgb_best_params = grid_search_xgb.best_params_
xgb_best_score = grid_search_xgb.best_score_
print("En iyi parametreler:", xgb_best_params)
print("En iyi doğruluk:", xgb_best_score)

En iyi parametreler: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
En iyi doğruluk: 0.9167748917748918


In [140]:
# Build the tuned model directly from the grid-search result instead of
# hand-copying the printed values, so it cannot drift from what the search found.
# The objective, eval metric and seed are kept exactly as this cell had them
# (note that the searched estimator itself used objective='multi:softmax').
best_xgb = XGBClassifier(
    **grid_search_xgb.best_params_,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42
)

In [141]:
# Train the tuned XGBoost model on the training split, then predict the test split.
y_pred_best_xgb = best_xgb.fit(X_train, y_train).predict(X_test)

In [142]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
best_xgb_accuracy = accuracy_score(y_test, y_pred_best_xgb)
best_xgb_report = classification_report(y_test, y_pred_best_xgb, target_names=le.classes_)
print("Final XGBoost Accuracy:", best_xgb_accuracy)
print(best_xgb_report)

Final XGBoost Accuracy: 0.9143939393939394
              precision    recall  f1-score   support

      Cloudy       0.88      0.91      0.90       990
       Rainy       0.90      0.92      0.91       990
       Snowy       0.95      0.92      0.93       990
       Sunny       0.94      0.91      0.92       990

    accuracy                           0.91      3960
   macro avg       0.92      0.91      0.91      3960
weighted avg       0.92      0.91      0.91      3960



# Gradient Boosted Tree hiperparametre optimizasyonu

In [134]:
# Candidate values for the gradient-boosting grid search (2 * 3 * 3 * 2 * 2 = 72 combinations).
param_grid_gbt = dict(
    n_estimators=[100, 150],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)


In [135]:
# Fresh, unfitted estimator for the grid search. This rebinds `gb_model`, so the
# baseline model fitted under the same name earlier in the notebook is replaced.
gb_model = GradientBoostingClassifier(random_state=42)

In [136]:
# Exhaustive search over `param_grid_gbt`, scored by accuracy with 3-fold
# cross-validation, using all available cores.
grid_search_gbt = GridSearchCV(
    gb_model,
    param_grid_gbt,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

In [137]:
# Run the search on the training split only; the test split is not used here.
grid_search_gbt.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_grid,"{'learning_rate': [0.05, 0.1, ...], 'max_depth': [3, 4, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.2
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [138]:
# Report the winning parameter combination and its mean cross-validated accuracy.
gbt_best_params = grid_search_gbt.best_params_
gbt_best_score = grid_search_gbt.best_score_
print("En iyi parametreler:", gbt_best_params)
print("En iyi doğruluk (CV):", gbt_best_score)

En iyi parametreler: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
En iyi doğruluk (CV): 0.9158008658008657


In [143]:
# Build the tuned model directly from the grid-search result instead of
# hand-copying the printed values, so it cannot drift from what the search found.
# The seed matches the estimator that was searched.
best_gbt = GradientBoostingClassifier(
    **grid_search_gbt.best_params_,
    random_state=42
)

In [144]:
# Train the tuned gradient-boosting model on the training split, then predict the test split.
y_pred_best_gbt = best_gbt.fit(X_train, y_train).predict(X_test)

In [145]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
best_gbt_accuracy = accuracy_score(y_test, y_pred_best_gbt)
best_gbt_report = classification_report(y_test, y_pred_best_gbt, target_names=le.classes_)
print("Final GBT Accuracy:", best_gbt_accuracy)
print(best_gbt_report)

Final GBT Accuracy: 0.9141414141414141
              precision    recall  f1-score   support

      Cloudy       0.88      0.91      0.89       990
       Rainy       0.90      0.91      0.91       990
       Snowy       0.96      0.92      0.94       990
       Sunny       0.92      0.92      0.92       990

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960



# Random Forest hiperparametre optimizasyonu

In [152]:
# Candidate values for the random-forest grid search (2 * 3 * 2 * 2 * 2 = 48 combinations).
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

In [153]:
# Unfitted forest (fixed seed) used as the base estimator of the grid search.
rf = RandomForestClassifier(random_state=42)

In [154]:
# Exhaustive search over `param_grid_rf`, scored by accuracy with 3-fold
# cross-validation, using all available cores.
grid_search_rf = GridSearchCV(
    rf,
    param_grid_rf,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

In [155]:
# Run the search on the training split only; the test split is not used here.
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [156]:
# Report the winning parameter combination and its mean cross-validated accuracy.
rf_best_params = grid_search_rf.best_params_
rf_best_score = grid_search_rf.best_score_
print("En iyi parametreler:", rf_best_params)
print("En iyi doğruluk (CV):", rf_best_score)

En iyi parametreler: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
En iyi doğruluk (CV): 0.9172077922077921


In [158]:
# Build the tuned model directly from the grid-search result instead of
# hand-copying the printed values, so it cannot drift from what the search found.
# The seed matches the estimator that was searched.
best_rf = RandomForestClassifier(
    **grid_search_rf.best_params_,
    random_state=42
)

In [159]:
# Train the tuned forest on the training split; the fitted estimator is the cell output.
best_rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [160]:
# Predict the encoded class labels for the held-out test rows.
y_pred_best_rf = best_rf.predict(X_test)

In [161]:
# Test-set accuracy followed by the per-class precision/recall/F1 table.
best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
best_rf_report = classification_report(y_test, y_pred_best_rf, target_names=le.classes_)
print("Final Tuned RF Accuracy:", best_rf_accuracy)
print(best_rf_report)

Final Tuned RF Accuracy: 0.9098484848484848
              precision    recall  f1-score   support

      Cloudy       0.85      0.91      0.88       990
       Rainy       0.89      0.92      0.90       990
       Snowy       0.96      0.91      0.94       990
       Sunny       0.94      0.90      0.92       990

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960

