In [None]:
# Makine öğrenmesi, derin öğrenme ve veri analizi için temel kütüphaneler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Makine öğrenmesi modelleri ve metrikleri
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve

from sklearn.ensemble import RandomForestClassifier

# Derin öğrenme için
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Silence warnings (optional).
# NOTE: the 'ignore' filter is installed without a category, so it hides every
# warning, including deprecation notices from the libraries imported above.
import warnings
warnings.filterwarnings('ignore')

# Confirmation message shown once the cell has run to this point.
print("Tüm kütüphaneler başarıyla yüklendi!")


In [None]:
import pandas as pd

# Load the CSV into the working frame `df` (going by its file name: 2024
# station-level rail-system passenger and journey counts). The path is
# relative to the notebook's working directory.
df = pd.read_csv("rayl-sistemler-istasyon-bazl-yolcu-ve-yolculuk-saylar-2024.csv")

# Preview the first rows.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()


In [None]:
# 1. List the available columns to spot any that are not needed
print(df.columns)

# 2. Count the missing values in every column
print(df.isnull().sum())

# 3. Label-encode the categorical columns (line, station_name, station_number, town)
#    in place. Each fitted encoder is stored in le_dict, keyed by column name,
#    so the integer codes can be mapped back to the original labels later.
categorical_cols = ['line', 'station_name', 'station_number', 'town']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    le_dict[col] = le
    # Values are cast to str before encoding, so a missing entry is presumably
    # encoded as its string form rather than staying missing.
    df[col] = le.fit_transform(df[col].astype(str))

# 4. longitude and latitude are left as they are (they may be useful to the model)
# 5. Build the target busy_station: BUSY (1) when passanger_cnt is above 70% of
#    its mean, LESS BUSY (0) otherwise
threshold = 0.7 * df['passanger_cnt'].mean()
df['busy_station'] = df['passanger_cnt'].gt(threshold).astype(int)

print(df[['passanger_cnt', 'busy_station']].head(10))
print(df['busy_station'].value_counts())


In [None]:
# Feature columns (the target and identifier columns are excluded).
# NOTE(review): passage_cnt is presumably closely related to passanger_cnt, the
# column the target is derived from — confirm it does not leak the label.
features = ['transaction_year', 'transaction_month', 'line', 'station_name',
            'station_number', 'town', 'longitude', 'latitude', 'passage_cnt']

X = df[features]
y = df['busy_station']

# Split BEFORE scaling (test_size=0.2 -> 20% test, 80% train; stratified on the
# target). Fitting the scaler on the full matrix would let test-set statistics
# leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the numeric features (especially important for the neural
# network): statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-only statistics; kept so any code that
# still reads X_scaled keeps working.
X_scaled = scaler.transform(X)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


In [None]:
# Build and train a Random Forest classifier (fit() returns the fitted estimator)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the positive-class probability used for ROC-AUC
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Test-set scores
rf_acc = accuracy_score(y_test, y_pred_rf)
rf_prec = precision_score(y_test, y_pred_rf)
rf_rec = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_proba_rf)

# Report every score with three decimals, one per line
print("Random Forest Scores:")
print("\n".join(
    f"{label}: {value:.3f}"
    for label, value in (
        ("Accuracy", rf_acc),
        ("Precision", rf_prec),
        ("Recall", rf_rec),
        ("F1 Score", rf_f1),
        ("ROC-AUC", rf_auc),
    )
))


In [None]:
# Fill missing values with the column mean
from sklearn.impute import SimpleImputer

# Split the raw features first, with the same test_size / random_state /
# stratify settings as the earlier split. Fitting the imputer on every row
# before splitting would leak test-set means into the training data, and
# overwriting X would make this cell depend on how often it has been run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the column means from the training rows only, then apply them to both parts
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Now scale, again with statistics taken from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


from sklearn.linear_model import LogisticRegression

# Train a logistic regression on the current training split
# (fit() returns the fitted estimator)
logreg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the positive-class probability used for ROC-AUC
y_pred_lr = logreg_model.predict(X_test)
y_proba_lr = logreg_model.predict_proba(X_test)[:, 1]

# Test-set scores
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_prec = precision_score(y_test, y_pred_lr)
lr_rec = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_proba_lr)

# Report every score with three decimals, one per line
print("Logistic Regression Scores:")
print("\n".join(
    f"{label}: {value:.3f}"
    for label, value in (
        ("Accuracy", lr_acc),
        ("Precision", lr_prec),
        ("Recall", lr_rec),
        ("F1 Score", lr_f1),
        ("ROC-AUC", lr_auc),
    )
))


In [None]:
!pip install xgboost

from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_prec = precision_score(y_test, y_pred_xgb)
xgb_rec = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_auc = roc_auc_score(y_test, y_proba_xgb)

print("XGBoost Scores:")
print(f"Accuracy: {xgb_acc:.3f}")
print(f"Precision: {xgb_prec:.3f}")
print(f"Recall: {xgb_rec:.3f}")
print(f"F1 Score: {xgb_f1:.3f}")
print(f"ROC-AUC: {xgb_auc:.3f}")


In [None]:
import pandas as pd

# Collect every score in one table: one row per model, one column per metric.
# The column names are zipped with their value lists in matching order.
results = dict(zip(
    ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"],
    [
        ["Random Forest", "Logistic Regression", "XGBoost"],
        [rf_acc, lr_acc, xgb_acc],
        [rf_prec, lr_prec, xgb_prec],
        [rf_rec, lr_rec, xgb_rec],
        [rf_f1, lr_f1, xgb_f1],
        [rf_auc, lr_auc, xgb_auc],
    ],
))

results_df = pd.DataFrame(data=results)
print("Model Karşılaştırma Tablosu:\n")
print(results_df)


In [None]:
# ROC curves of the three classical models on one set of axes
fig, ax = plt.subplots(figsize=(8, 6))

# Random Forest
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)
ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {rf_auc:.2f})')

# Logistic Regression
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {lr_auc:.2f})')

# XGBoost
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_proba_xgb)
ax.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {xgb_auc:.2f})')

# Diagonal reference line labelled as random guessing
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Karşılaştırması')
ax.legend(loc='lower right')
ax.grid()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Model labels and their test-set predictions, in matching order
model_names = ['Random Forest', 'Logistic Regression', 'XGBoost']
y_preds = [y_pred_rf, y_pred_lr, y_pred_xgb]

# One annotated confusion-matrix heatmap per model, each in its own figure
for name, y_pred in zip(model_names, y_preds):
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{name} - Confusion Matrix")
    ax.set_xlabel("Tahmin")
    ax.set_ylabel("Gerçek")
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"]
palette = sns.color_palette("mako", len(results_df))  # one colour per model row

# One row of panels, one panel per metric
fig, axes = plt.subplots(1, len(metrics), figsize=(18, 4))

for idx, metric in enumerate(metrics):
    ax = axes[idx]
    sns.barplot(
        x="Model",
        y=metric,
        data=results_df,
        palette=palette,
        edgecolor="black",
        linewidth=2,
        ax=ax
    )
    ax.set_title(metric, fontsize=15, fontweight='bold')
    ax.set_ylim(0, 1)
    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.setp(ax.get_xticklabels(), rotation=20, fontsize=11)
    # Write each bar's value just above it
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f"{bar_height:.2f}",
            (bar_center, bar_height),
            ha='center',
            va='bottom',
            fontsize=11,
            fontweight='bold',
            color='black'
        )
    # Simplify the frame and grid: keep only a thicker bottom spine and
    # dashed horizontal grid lines
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(1.5)
    ax.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.6)

fig.tight_layout()
# The figure-level title is added after tight_layout and placed above the panels (y > 1)
fig.suptitle("Modellerin Karşılaştırmalı Performans Metrikleri", y=1.07, fontsize=18, fontweight="bold")
plt.show()


In [None]:
# F1 score of every model in results_df as a single bar chart
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x="Model", y="F1 Score", data=results_df, ax=ax)
ax.set_title("F1 Score Karşılaştırması")
ax.set_ylabel("F1 Score")
ax.set_ylim(0, 1)  # fixed 0-1 scale for the score axis
plt.show()


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling are repeatable; the other
# models in this notebook already pin random_state=42.
tf.keras.utils.set_random_seed(42)

# Build the model: two ReLU hidden layers with dropout, and a single sigmoid
# unit that outputs the probability of the positive class. The input width is
# declared with an explicit Input layer instead of the legacy `input_dim`
# argument on the first Dense layer.
dl_model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile for binary classification
dl_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

# Start training; 20% of the training data is held out for validation,
# and the per-epoch metrics are kept in `history`
history = dl_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    verbose=1
)


In [None]:
# Predict on the test data: flatten the network output to a 1-D array of
# probabilities, then threshold at 0.5 to get class labels
y_pred_dl_proba = dl_model.predict(X_test).ravel()
y_pred_dl = (y_pred_dl_proba > 0.5).astype(int)

# Compute the metrics
dl_acc = accuracy_score(y_test, y_pred_dl)
dl_prec = precision_score(y_test, y_pred_dl)
dl_rec = recall_score(y_test, y_pred_dl)
dl_f1 = f1_score(y_test, y_pred_dl)
dl_auc = roc_auc_score(y_test, y_pred_dl_proba)

# Report every score with three decimals, one per line
print("Dense Neural Network Scores:")
print("\n".join(
    f"{label}: {value:.3f}"
    for label, value in (
        ("Accuracy", dl_acc),
        ("Precision", dl_prec),
        ("Recall", dl_rec),
        ("F1 Score", dl_f1),
        ("ROC-AUC", dl_auc),
    )
))


In [None]:
import pandas as pd

# Build the row to add as a one-row DataFrame
new_row = pd.DataFrame([{
    "Model": "Dense Neural Network",
    "Accuracy": dl_acc,
    "Precision": dl_prec,
    "Recall": dl_rec,
    "F1 Score": dl_f1,
    "ROC-AUC": dl_auc
}])

# Then merge it with concat. Any "Dense Neural Network" row left over from an
# earlier run of this cell is dropped first, so re-running the cell replaces
# the row instead of appending a duplicate.
results_df = pd.concat(
    [results_df[results_df["Model"] != "Dense Neural Network"], new_row],
    ignore_index=True
)

print("Tüm Modellerin Karşılaştırma Tablosu:\n")
print(results_df)


In [None]:
import matplotlib.pyplot as plt

# Use the model name as the index so every model becomes one group of bars
results_plot = results_df.set_index("Model")

# Draw all metrics in a single chart (every metric for every model)
ax = results_plot.plot(kind="bar", figsize=(12, 6))

ax.set_title("Tüm Modellerin Karşılaştırmalı Performansı")
ax.set_ylabel("Skor")
ax.set_xlabel("Model")
plt.setp(ax.get_xticklabels(), rotation=20)
# Anchor the legend outside the axes, to the right of the plot
ax.legend(title="Metrik", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.tight_layout()
plt.show()


In [None]:
# ROC curve of the dense neural network, with a dashed diagonal for reference
fpr_dl, tpr_dl, _ = roc_curve(y_test, y_pred_dl_proba)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_dl, tpr_dl, label=f'Dense Neural Network (AUC = {dl_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Dense Neural Network")
ax.legend()
plt.show()


In [None]:
# Confusion matrix of the dense neural network as an annotated heatmap
cm_dl = confusion_matrix(y_test, y_pred_dl)
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm_dl, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Dense Neural Network - Confusion Matrix")
ax.set_xlabel("Tahmin")
ax.set_ylabel("Gerçek")
plt.show()


In [None]:
# Training vs. validation curves recorded by Keras: loss on the left panel,
# accuracy on the right
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))

ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Val Loss')
ax_loss.set_title('Loss (Eğitim vs Validasyon)')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Val Accuracy')
ax_acc.set_title('Accuracy (Eğitim vs Validasyon)')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()
plt.show()


In [None]:
# Filled kernel-density estimate of the passanger_cnt column
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(df["passanger_cnt"], fill=True, ax=ax)
ax.set_title("passanger_cnt Yoğunluk Grafiği (KDE)")
ax.set_xlabel("passanger_cnt")
plt.show()


In [None]:
# Number of records per district.
# df["town"] holds LabelEncoder integer codes at this point, so the codes are
# mapped back to the district names with the encoder stored in le_dict;
# otherwise the x axis would show meaningless integers.
town_labelled = df.assign(town=le_dict["town"].inverse_transform(df["town"]))

plt.figure(figsize=(12,4))
sns.countplot(x="town", data=town_labelled)
plt.xticks(rotation=90)  # vertical tick labels
plt.title("İlçelere Göre Kayıt Sayısı")
plt.show()


In [None]:
# Records plotted at their longitude / latitude, coloured by passanger_cnt
fig, ax = plt.subplots(figsize=(8, 6))
sc = ax.scatter(
    df["longitude"],
    df["latitude"],
    c=df["passanger_cnt"],
    cmap="hot",
    alpha=0.6,
    s=60,
)
fig.colorbar(sc, ax=ax, label="Yolcu Sayısı")
ax.set_title("İstanbul Raylı Sistem Yolcu Yoğunluğu (Coğrafi Harita)")
ax.set_xlabel("Boylam")
ax.set_ylabel("Enlem")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations of the numeric columns as an annotated heatmap.
# NOTE(review): if the label-encoded categorical columns are still in df, their
# integer codes take part in these correlations — confirm that is intended.
fig, ax = plt.subplots(figsize=(10, 7))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Korelasyon Isı Haritası")
plt.show()



In [None]:
import plotly.express as px

# 1. Prepare the frame used by the hierarchical charts.
#    The hierarchy columns hold LabelEncoder integer codes at this point, so
#    they are mapped back to their original labels with the encoders stored in
#    le_dict. Values that were missing before encoding come back as the string
#    'nan' (they were cast to str before encoding); those are turned back into
#    real missing values so that dropna below actually removes them.
hierarchy_cols = ['line', 'town', 'station_name']
df_sun = df.copy()
for col in hierarchy_cols:
    decoded = pd.Series(le_dict[col].inverse_transform(df[col]), index=df.index)
    df_sun[col] = decoded.where(decoded != 'nan')
df_sun = df_sun.dropna(subset=hierarchy_cols + ['passanger_cnt'])

# 2. Sunburst (line -> district -> station)
fig = px.sunburst(
    df_sun,
    path=['line', 'town', 'station_name'],
    values='passanger_cnt',
    title='Hat → İlçe → İstasyonlara Göre Yolcu Payı'
)
fig.show()

# 3. Treemap (line -> station)
fig2 = px.treemap(
    df_sun,
    path=['line', 'station_name'],
    values='passanger_cnt',
    title='Hat ve İstasyonlara Göre Yolcu Payı'
)
fig2.show()


In [None]:
import seaborn as sns
# Distribution of passanger_cnt per district.
# df["town"] holds LabelEncoder integer codes at this point; decode them with
# the encoder stored in le_dict so the x axis shows district names.
town_labelled = df.assign(town=le_dict["town"].inverse_transform(df["town"]))

plt.figure(figsize=(8,6))
sns.violinplot(x="town", y="passanger_cnt", data=town_labelled)
plt.xticks(rotation=90)  # vertical tick labels
plt.title("İlçelere Göre Yolcu Sayısı Dağılımı (Violinplot)")
plt.show()


In [None]:
# Ten stations with the highest total passanger_cnt.
# df["station_name"] holds LabelEncoder integer codes at this point; decode
# them with the encoder stored in le_dict so the bars are labelled with
# station names instead of integers.
station_labels = le_dict["station_name"].inverse_transform(df["station_name"])
top_stations = (
    df.assign(station_name=station_labels)
    .groupby("station_name")["passanger_cnt"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_stations.plot(kind='barh', figsize=(8,5), color='teal')
plt.title("En Çok Yolcu Taşıyan 10 İstasyon")
plt.xlabel("Toplam Yolcu Sayısı")
plt.gca().invert_yaxis()  # largest total at the top
plt.tight_layout()
plt.show()


In [None]:
# Total and mean passanger_cnt per district, sorted by total (descending).
# df["town"] holds LabelEncoder integer codes at this point; decode them with
# the encoder stored in le_dict so the index (and therefore the bar labels)
# shows district names.
town_labels = le_dict["town"].inverse_transform(df["town"])
town_stats = (
    df.assign(town=town_labels)
    .groupby("town")["passanger_cnt"]
    .agg(["sum", "mean"])
    .sort_values("sum", ascending=False)
)

# Totals
town_stats["sum"].plot(kind='bar', figsize=(10,5), color='orange')
plt.title("İlçelere Göre Toplam Yolcu Sayısı")
plt.ylabel("Yolcu")
plt.tight_layout()
plt.show()

# Means (same district order as the totals chart)
town_stats["mean"].plot(kind='bar', figsize=(10,5), color='navy')
plt.title("İlçelere Göre Ortalama Yolcu Sayısı")
plt.ylabel("Ortalama Yolcu")
plt.tight_layout()
plt.show()


In [None]:
# Total passanger_cnt for every (line, district) pair; combinations without
# data are filled with 0.
# Both columns hold LabelEncoder integer codes at this point; decode them with
# the encoders stored in le_dict so the heatmap rows and columns are labelled
# with line and district names.
line_town_labelled = df.assign(
    line=le_dict["line"].inverse_transform(df["line"]),
    town=le_dict["town"].inverse_transform(df["town"]),
)
pivot = pd.pivot_table(line_town_labelled, values='passanger_cnt', index='line', columns='town', aggfunc='sum', fill_value=0)
plt.figure(figsize=(14,8))
sns.heatmap(pivot, annot=False, cmap='YlGnBu')
plt.title("Hat ve İlçelere Göre Toplam Yolcu (Heatmap)")
plt.xlabel("İlçe")
plt.ylabel("Hat")
plt.tight_layout()
plt.show()
