# Weather Condition Categorization 

In [3]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models and tools
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the weather classification CSV from the Kaggle input directory
data = pd.read_csv(
    '/kaggle/input/weather-type-classification/weather_classification_data.csv'
)

In [None]:
# Preview the first five rows of the raw frame
data.head(n=5)

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts)
# to spot missing values or mistyped columns before modelling.
data.info()

In [None]:
# Descriptive statistics for the dataset. Called with default arguments,
# so pandas restricts the summary to the numeric columns of a mixed frame.
print("\nSummary statistics of numerical variables:")
data.describe()

In [None]:
# Summarise the non-numeric columns (count, unique values, most frequent value)
categorical_dtypes = ['object', 'category']

print("\nSummary of categorical variables:")
data.describe(include=categorical_dtypes)

In [None]:
# Visual exploration: one histogram (with KDE overlay) per numerical variable,
# laid out on a 2x2 grid. Each spec is (column, bar colour, title, x-axis label).
histogram_specs = [
    ('Temperature', '#835C3B', 'Distribution of Temperature', 'Temperature (Celsius)'),
    ('Humidity', '#3F000F', 'Distribution of Humidity', 'Humidity (%)'),
    ('Wind Speed', '#1F6357', 'Distribution of Wind Speed', 'Wind Speed (km/h)'),
    ('Precipitation (%)', '#3C565B', 'Distribution of Precipitation', 'Precipitation (%)'),
]

plt.figure(figsize=(12, 8))

for panel, (column, colour, title, x_label) in enumerate(histogram_specs, start=1):
    # Panels are numbered 1..4, left-to-right then top-to-bottom
    plt.subplot(2, 2, panel)
    sns.histplot(data[column], bins=20, kde=True, color=colour, alpha=0.7)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Visual exploration: category frequencies for each categorical column
# (including the target), one panel per column on a 2x2 grid.
countplot_specs = [
    # (column, seaborn palette)
    ('Cloud Cover', 'Set2'),
    ('Season', 'Set1'),
    ('Location', 'Set3'),
    ('Weather Type', 'Pastel1'),
]

plt.figure(figsize=(16, 10))

for panel, (column, palette_name) in enumerate(countplot_specs, start=1):
    plt.subplot(2, 2, panel)
    sns.countplot(x=column, data=data, palette=palette_name)
    plt.title(f'Count of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Count missing values per column
data.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Pull out the target column, then keep every other column as a predictor
y = data['Weather Type']
X = data.drop(columns='Weather Type')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define categorical features for label encoding
categorical_features = ['Cloud Cover', 'Season', 'Location']

# The split frames are subsets of `X`; take explicit copies before writing
# columns into them so the assignments below cannot trip pandas'
# SettingWithCopyWarning and never touch the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per column. Reusing a single LabelEncoder would overwrite
# its learned classes on every iteration, leaving no way to map the integer
# codes of earlier columns back to their labels or to encode new data later.
label_encoders = {}

for feature in categorical_features:
    label_encoder = LabelEncoder()
    # Learn the category -> integer mapping from the training rows only ...
    X_train[feature] = label_encoder.fit_transform(X_train[feature])
    # ... and reuse it for the test rows. NOTE: LabelEncoder.transform raises
    # ValueError if the test split contains a category absent from training.
    X_test[feature] = label_encoder.transform(X_test[feature])
    label_encoders[feature] = label_encoder

In [None]:
# Show the first rows of the training features once the categorical columns are integer-coded
print("Sample of X_train after label encoding:")
X_train.head(5)

In [None]:
# Columns that will be standardized (zero mean, unit variance) by the scaler
numeric_features = [
    'Temperature',
    'Humidity',
    'Wind Speed',
    'Precipitation (%)',
    'Atmospheric Pressure',
    'UV Index',
    'Visibility (km)',
]

In [None]:
# Learn the scaling statistics from the training rows only
scaler = StandardScaler().fit(X_train[numeric_features])

# Write the standardized values into a copy so X_train keeps its unscaled columns
X_train_scaled = X_train.copy()
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])

In [None]:
# Apply the already-fitted scaler to the test rows (no refitting on test data)
test_numeric_scaled = scaler.transform(X_test[numeric_features])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_features] = test_numeric_scaled

# Preview the scaled training features
print("Sample of X_train after numerical feature scaling:")
X_train_scaled.head()

In [None]:
# Numerical columns whose pairwise correlations are plotted
numerical_features = [
    'Temperature',
    'Humidity',
    'Wind Speed',
    'Precipitation (%)',
    'Atmospheric Pressure',
    'UV Index',
    'Visibility (km)',
]

# Pairwise correlation on the raw (unscaled) data
corr_matrix = data[numerical_features].corr()

# Boolean mask that hides the diagonal and everything above it,
# so each pair of features appears only once in the plot
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='copper',
    linewidths=0.5,
)
plt.title('Correlation Matrix Heatmap (Numerical Features)')
plt.show()

In [None]:
# Candidate models keyed by display name; insertion order is the evaluation order.
# Every estimator that accepts a seed gets random_state=42 for reproducibility.
classifiers = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
])

In [None]:
# Train each candidate on the scaled training split and report its
# performance on the scaled held-out test split.
for clf_name, clf in classifiers.items():
    print(f"Training {clf_name}...")
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Overall accuracy on the test split
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Per-class precision / recall / F1. The header has to go through print():
    # a bare f-string inside a loop body is evaluated and silently discarded.
    print(f"\nClassification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))

    # Visual separator between models
    print("="*80)

In [None]:
# Fit a fresh Random Forest on the scaled training data (fit() returns the estimator itself)
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Importance scores come back in the same order as the training columns
importance = rf_classifier.feature_importances_
feature_names = X.columns

# Pair each feature with its score and rank from most to least important
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

print("Feature Importance (Random Forest):")
feature_importance_df

In [None]:
# Horizontal bar chart of the ranked Random Forest importances
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette='copper')
ax.set_title('Feature Importance - Random Forest Classifier')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()