In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import pickle

In [None]:
# Load the weather classification dataset (path is relative to the notebook's working directory).
data=pd.read_csv('../DataScience/weather_classification_data.csv')

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of rows per target label.
data['Weather Type'].value_counts()

In [None]:
# Distinct labels of the target column.
data['Weather Type'].unique()

In [None]:
# Distinct labels of the 'Cloud Cover' column.
data['Cloud Cover'].unique()

In [None]:
# Distinct labels of the 'Season' column.
data['Season'].unique()

In [None]:
# Distinct labels of the 'Location' column.
data['Location'].unique()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
data.head()

In [None]:
data['Cloud Cover']=data['Cloud Cover'].replace({'partly cloudy':0, 'clear':1, 'overcast':2, 'cloudy':3})
data['Season']=data['Season'].replace({'Winter':0, 'Spring':1, 'Summer':2, 'Autumn':3})
data['Location']=data['Location'].replace({'inland':0, 'mountain':1, 'coastal':2})
data['Weather Type']=data['Weather Type'].replace({'Rainy':0, 'Cloudy':1, 'Sunny':2, 'Snowy':3})

In [None]:
data.head()

In [None]:
def checkOutliers(data, col):
    """Show a boxplot and a histogram of one column side by side.

    The histogram carries a KDE curve, a black vertical line at the column
    mean and red dashed lines at mean +/- 3 standard deviations.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    col : str
        Name of the column to plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Explicit Axes instead of the pyplot state machine: each call names its panel.
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 4))

    box_ax.boxplot(data[col])
    box_ax.set_title(f"Boxplot of {col}")

    sns.histplot(data=data, x=col, kde=True, ax=hist_ax)

    mean = data[col].mean()
    std = data[col].std()

    hist_ax.axvline(mean, color='black')
    hist_ax.axvline(mean + 3*std, color='red', linestyle='--')
    hist_ax.axvline(mean - 3*std, color='red', linestyle='--')
    hist_ax.set_title(f"Histogram of {col}")

    # One layout pass once both panels exist (padding from borders and between plots).
    fig.tight_layout(pad=3)
    plt.show()

In [None]:
def handleOutliers(data,col):
    """Cap the values of ``data[col]`` at the 1.5*IQR fences, in place.

    Values above ``Q3 + 1.5*IQR`` are set to that upper fence and values below
    ``Q1 - 1.5*IQR`` are set to the lower fence; everything else (including
    missing values) is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the column ``col`` is overwritten.
    col : str
        Name of a numeric column.

    Returns
    -------
    None
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (1.5*iqr)
    upper = q3 + (1.5*iqr)
    # clip() upcasts an integer column when a fence is fractional; writing the
    # float fence into an int column through .loc is a deprecated
    # incompatible-dtype assignment in recent pandas.
    data[col] = data[col].clip(lower=lower, upper=upper)

In [None]:
print(data.columns)

In [None]:
checkOutliers(data,'Temperature')
checkOutliers(data,'Wind Speed')
checkOutliers(data,'Precipitation (%)')
checkOutliers(data,'Atmospheric Pressure')
checkOutliers(data,'Visibility (km)')

In [None]:
handleOutliers(data,'Temperature')
handleOutliers(data,'Wind Speed')
handleOutliers(data,'Precipitation (%)')
handleOutliers(data,'Atmospheric Pressure')
handleOutliers(data,'Visibility (km)')

In [None]:
checkOutliers(data,'Temperature')
checkOutliers(data,'Wind Speed')
checkOutliers(data,'Precipitation (%)')
checkOutliers(data,'Atmospheric Pressure')
checkOutliers(data,'Visibility (km)')

In [None]:
data.head()

In [None]:
data['Weather Type'].value_counts()

## Feature Engineering

In [None]:
# Separate the target from the features: `y` keeps the 'Weather Type' column and
# `data` is rebound to a frame without it.
# NOTE: re-running this cell fails with a KeyError because the column is already gone.
y=data['Weather Type']
data=data.drop('Weather Type',axis=1)

In [None]:
def calculate(data):
    """Build a table of variance inflation factors for the columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature frame; every column is treated as a regressor.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the column name in 'Attribute'
        and its variance inflation factor in 'vif'.
    """
    # NOTE(review): no constant column is added before computing the VIFs —
    # confirm that is intended for this analysis.
    vif_scores = [
        variance_inflation_factor(data, position)
        for position in range(data.shape[1])
    ]
    return pd.DataFrame({'Attribute': data.columns, 'vif': vif_scores})

In [None]:
# VIF of every feature before any column is removed.
calculate(data)

In [None]:
# Remove 'Atmospheric Pressure' — presumably chosen from the VIF table above; outputs are not stored here.
data=data.drop('Atmospheric Pressure',axis=1)

In [None]:
# Recompute VIFs on the remaining features.
calculate(data)

In [None]:
# Remove 'Humidity' — presumably chosen from the VIF table above; verify against the output.
data=data.drop('Humidity',axis=1)

In [None]:
# VIFs of the final feature set.
calculate(data)

In [None]:
data.describe()

In [None]:
# X is another name for the same feature frame (no copy is made).
X=data

In [None]:
X.head()

In [None]:
y.value_counts()

In [None]:
ss=StandardScaler()
X=ss.fit_transform(X)

## Splitting data into training and testing

In [None]:

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

# KNN

In [None]:

# Baseline KNN with the library's default settings.
knn=KNeighborsClassifier()

In [None]:
knn.fit(X_train,y_train)

In [None]:
# Predict the held-out rows; the cell displays how many predictions were made.
test_pred=knn.predict(X_test)
test_pred.size
# test_pred

In [None]:
# Accuracy on the held-out rows.
accuracy_score(y_test,test_pred)

In [None]:
# Predict the training rows; the cell displays how many predictions were made.
train_pred=knn.predict(X_train)
train_pred.size
# train_pred

In [None]:
# Accuracy on the rows the model was fitted on.
accuracy_score(y_train,train_pred)

In [None]:
# Class counts in the held-out split.
y_test.value_counts()

In [None]:
knn=KNeighborsClassifier()
val_result=cross_validate(knn,X_train,y_train)
val_result

In [None]:
train_acc=val_result['test_score'].mean()
print("Training Accuracy:",train_acc)

In [None]:
knn=KNeighborsClassifier()
val_result=cross_validate(knn,X_test,y_test)
val_result

In [None]:
test_acc=val_result['test_score'].mean()
print("Testing Accuracy:",test_acc)

In [None]:
# Fit one KNN per odd K in 1..19 and record accuracy on both splits.
k_values = list(range(1, 21, 2))
train_acc, test_acc = [], []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    train_pred, test_pred = knn.predict(X_train), knn.predict(X_test)

    train_acc.append(accuracy_score(y_train, train_pred))
    test_acc.append(accuracy_score(y_test, test_pred))

# Plot Accuracy vs K
fig, ax = plt.subplots(figsize=(8, 5))
for scores, label in ((train_acc, 'Train Accuracy'), (test_acc, 'Test Accuracy')):
    ax.plot(k_values, scores, marker='o', label=label)
ax.set_xlabel("K Value")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()


In [None]:
# Final KNN with K fixed at 5.
# NOTE(review): K is hard-coded rather than taken from the sweep above — confirm 5 is the intended choice.
knn_final = KNeighborsClassifier(n_neighbors=5)
knn_final.fit(X_train, y_train)

test_pred = knn_final.predict(X_test)


In [None]:
# Held-out accuracy of the final KNN.
accuracy_score(y_test,test_pred)

In [None]:

# confusion_matrix(y_test,test_pred)

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, test_pred)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test,test_pred))

# Decision tree

In [None]:
X=data

In [None]:
X.head()

In [None]:
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:

dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [None]:
test_pred = dt.predict(X_test)

In [None]:
test_pred.shape

In [None]:
print("Testing Accuracy:", accuracy_score(y_test, test_pred))

In [None]:
cm = confusion_matrix(y_test, test_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Decision Tree Confusion Matrix")
plt.show()


In [None]:
# Hyper-parameter grid for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy on the training split.
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)


In [None]:
# Replace the baseline tree with the best estimator found by the search and score it on the held-out rows.
dt = grid.best_estimator_

test_pred = dt.predict(X_test)

print("Final Accuracy:", accuracy_score(y_test, test_pred))


In [None]:
# plt.figure(figsize=(8,8))
# plot_tree(dt, feature_names=X.columns, class_names=y.unique(), filled=True)
# plt.show()
# Draw the tuned tree; class names are the encoded labels in the order stored in dt.classes_.
plt.figure(figsize=(10,10))
plot_tree(dt,feature_names=X.columns,class_names=[str(i) for i in dt.classes_],filled=True)
plt.show()



# Random forest

In [None]:
data.head()

In [None]:
# Use the feature frame `data` as the model input.
X=data


In [None]:
X.head()

In [None]:
X.describe()

In [None]:
y.describe()

In [None]:

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
X_train.head()

In [None]:
# Baseline forest: 300 trees, seeded, with class weights balanced.
# cross_validate scores the estimator on folds of the training split;
# the explicit fit afterwards trains `rf` on the whole training split.
rf=RandomForestClassifier( n_estimators=300,random_state=42,class_weight="balanced")
result=cross_validate(rf,X_train,y_train)
rf.fit(X_train,y_train)

In [None]:
result

In [None]:
# Mean validation score across the folds.
result['test_score'].mean()

In [None]:

# Held-out accuracy of the baseline forest.
test_pred=rf.predict(X_test)
print('Accuracy:',accuracy_score(y_test,test_pred))

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, test_pred)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()


In [None]:
print(classification_report(y_test, test_pred))

In [None]:
# Hyper-parameter grid for the forest.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# NOTE(review): the searched estimator does not set class_weight="balanced",
# unlike the baseline forest above — confirm whether that difference is intended.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)


In [None]:
# Rebind `rf` to the best estimator found by the search and score it on the held-out rows.
rf = grid.best_estimator_

test_pred = rf.predict(X_test)

print("Final Random Forest Accuracy:",
      accuracy_score(y_test, test_pred))


# Logistic regression

In [None]:
X.head()

In [None]:
y.value_counts()

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:


lr=LogisticRegression()

In [None]:
# result=cross_validate(lr,X_train,y_train)
lr.fit(X_train,y_train)

In [None]:
test_pred=lr.predict(X_test)

In [None]:
accuracy_score(y_test,test_pred)

In [None]:
result=cross_validate(lr,X_train,y_train)
lr.fit(X_train,y_train)

In [None]:
result

In [None]:
result['test_score'].mean()

In [None]:
# Predict with the logistic regression model `lr` — not the random forest `rf`
# that is still in scope from the previous section.
test_pred = lr.predict(X_test)
print("Logistic Regression testing Accuracy:", accuracy_score(y_test, test_pred))


In [None]:
# Class-membership probabilities predicted by `lr` for the held-out rows.
probability= lr.predict_proba(X_test)

In [None]:
probability

In [None]:
# Accuracy of `lr` on the rows it was fitted on.
train_pred=lr.predict(X_train)
accuracy_score(y_train,train_pred)

In [None]:
# Confusion matrix of `test_pred` as set by the most recent prediction cell.
cm = confusion_matrix(y_test, test_pred)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d',cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Logistic Regression Confusion Matrix")
plt.show()


In [None]:
print(classification_report(y_test,test_pred))

In [None]:
# Grid over the inverse regularisation strength C; penalty and solver are fixed to a single value each.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

# 5-fold cross-validated accuracy on the training split.
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)


In [None]:
# Rebind `lr` to the best estimator found by the grid search.
lr = grid.best_estimator_

y_pred_final = lr.predict(X_test)

# Score the tuned model's own predictions; `test_pred` still holds the
# predictions made before the grid search.
print("Final Logistic Regression Accuracy:",
      accuracy_score(y_test, y_pred_final))


# SVM

In [None]:
X.head()

In [None]:
y.value_counts()

In [None]:
# VIF table of the features used for the SVM.
calculate(X)

In [None]:
y.unique()

In [None]:

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:

# Baseline SVC with the library's default settings.
# NOTE(review): X is not passed through the StandardScaler in this section —
# confirm that fitting the SVC on unscaled features is intended.
model=SVC()
model.fit(X_train,y_train)

In [None]:
train_pred=model.predict(X_train)

In [None]:
# Accuracy on the rows the model was fitted on.
print("SVM Training Accuracy:",
      accuracy_score(y_train, train_pred))

In [None]:
test_pred = model.predict(X_test)

In [None]:
# accuracy_score is already imported in the notebook's first cell, so no re-import is needed here.
print("SVM Accuracy:", accuracy_score(y_test, test_pred))

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, test_pred)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("SVM Confusion Matrix")
plt.show()


In [None]:
print(classification_report(y_test, test_pred))

In [None]:
# Only the kernel is searched; the C and gamma entries are disabled.
param_grid = {
    # 'C': [0.1, 1, 10],
    # 'gamma': ['scale', 0.1, 0.01],
    'kernel': ['rbf', 'linear','poly']
}

# 5-fold cross-validated accuracy on the training split.
grid = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

In [None]:
# Rebind `model` to the best estimator found by the search and score it on the held-out rows.
model = grid.best_estimator_

test_pred = model.predict(X_test)

print("Final SVM Accuracy:",
      accuracy_score(y_test, test_pred))


In [None]:
svm = SVC(kernel='rbf')
result_svm = cross_validate(svm, X_train, y_train)
svm.fit(X_train, y_train)

In [None]:
test_pred = model.predict(X_test)
print("Final SVM Accuracy:",
      accuracy_score(y_test, test_pred))

In [None]:
train_pred=model.predict(X_train)

In [None]:
print("Final SVM Training Accuracy:",
      accuracy_score(y_train, train_pred))