In [None]:
# 🌦️ Weather Data Analysis & Temperature Forecasting
A complete exploratory data analysis (EDA) and machine learning prediction project.

## 📌 This project includes:
- Data cleaning  
- Handling missing values  
- Fixing unrealistic values  
- EDA: monthly averages, seasonal trends  
- Correlation analysis  
- Extreme weather detection  
- Temperature forecasting using Linear Regression  
- Beautiful visualizations  

In [None]:
### 📌 Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [None]:
### 📌 Step 2: Load Dataset & Basic Info

In [None]:
# Load the raw observations; the path is relative to the notebook's working directory.
weather_df = pd.read_csv('weatherHistory.csv')

# Structural overview: (rows, columns), then per-column dtypes and non-null counts.
print(weather_df.shape)
weather_df.info()

In [None]:
### 📌 Step 3: Check Missing Values

In [None]:
def missing_values(df):
    """Report which columns of ``df`` contain missing values.

    Prints one line per affected column ("<column> has <n> missing values")
    and returns the names of those columns, in column order.
    """
    na_counts = df.isna().sum()
    incomplete = na_counts[na_counts != 0]
    for name, n_missing in incomplete.items():
        print(f"{name} has {n_missing} missing values")
    return list(incomplete.index)

# Print the per-column missing counts; the returned list of column names is the cell output.
missing_values(weather_df)

In [None]:
### 📌 Step 4: Handle Missing Data

In [None]:
# Replace missing precipitation types with the explicit label 'none' instead of leaving NaN.
weather_df['Precip Type'] = weather_df['Precip Type'].fillna('none')

In [None]:
### 📌 Step 5: Fix Incorrect Temperature Values

In [None]:
def clean_incorrect_temperature(df, column='Temperature (C)', min_val=-50, max_val=60):
    """Replace out-of-range temperatures with the mean of the in-range ones.

    Values of ``df[column]`` below ``min_val`` or above ``max_val`` are
    overwritten in place, and a one-line summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    column : str
        Name of the temperature column.
    min_val, max_val : number
        Inclusive bounds of the values considered realistic.

    Notes
    -----
    The replacement value is the mean of the values that are *within* the
    bounds. Averaging over the whole column would let the very outliers being
    removed bias the replacement (it could even land outside the bounds).
    If no value is within the bounds the mean is NaN, so the out-of-range
    entries become NaN. Missing values are neither counted nor replaced.
    """
    out_of_range = (df[column] < min_val) | (df[column] > max_val)
    mean_temp = df.loc[~out_of_range, column].mean()
    count = int(out_of_range.sum())
    df.loc[out_of_range, column] = mean_temp
    print(f"{count} unrealistic temperatures replaced with mean ({mean_temp:.2f})")

# Clean the temperature column of weather_df in place using the default bounds (-50 to 60).
clean_incorrect_temperature(weather_df)

In [None]:
### 📌 Step 6: Fix Incorrect Wind Speed Values

In [None]:
def clean_incorrect_windspeed(df, column='Wind Speed (km/h)', min_val=0, max_val=200):
    """Replace out-of-range wind speeds with the mean of the in-range ones.

    Values of ``df[column]`` below ``min_val`` or above ``max_val`` are
    overwritten in place, and a one-line summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    column : str
        Name of the wind-speed column.
    min_val, max_val : number
        Inclusive bounds of the values considered realistic.

    Notes
    -----
    The replacement value is the mean of the values that are *within* the
    bounds, so the outliers being removed cannot bias it. If no value is
    within the bounds the mean is NaN, so the out-of-range entries become NaN.
    Missing values are neither counted nor replaced.
    """
    out_of_range = (df[column] < min_val) | (df[column] > max_val)
    mean_speed = df.loc[~out_of_range, column].mean()
    count = int(out_of_range.sum())
    df.loc[out_of_range, column] = mean_speed
    print(f"{count} unrealistic wind speeds replaced with mean ({mean_speed:.2f})")

# Clean the wind-speed column of weather_df in place using the default bounds (0 to 200).
clean_incorrect_windspeed(weather_df)

In [None]:
### 📌 Step 7: Average Temperature by Month

In [None]:
def avg_temp_by_month(df):
    """Return the mean of 'Temperature (C)' for each month number.

    'Formatted Date' is parsed with ``utc=True``, so the month is taken from
    the UTC timestamp. The caller's frame is not modified. The result is a
    Series indexed by 'Month'.
    """
    timestamps = pd.to_datetime(df['Formatted Date'], utc=True)
    with_month = df.assign(**{'Formatted Date': timestamps, 'Month': timestamps.dt.month})
    return with_month.groupby('Month')['Temperature (C)'].mean()

# Display the per-month mean temperature as the cell output.
avg_temp_by_month(weather_df)

In [None]:
### 📌 Step 8: Average Temperature by Year

In [None]:
def avg_temp_by_year(df):
    """Return the mean of 'Temperature (C)' for each calendar year.

    'Formatted Date' is parsed with ``utc=True``, so the year is taken from
    the UTC timestamp. The caller's frame is not modified. The result is a
    Series indexed by 'Year'.
    """
    timestamps = pd.to_datetime(df['Formatted Date'], utc=True)
    with_year = df.assign(**{'Formatted Date': timestamps, 'Year': timestamps.dt.year})
    return with_year.groupby('Year')['Temperature (C)'].mean()

# Display the per-year mean temperature as the cell output.
avg_temp_by_year(weather_df)

In [None]:
### 📌 Step 9: Correlation (Humidity vs Temperature)

In [None]:
# Series.corr uses the Pearson coefficient by default; the result lies between -1 and 1.
correlation = weather_df['Temperature (C)'].corr(weather_df['Humidity'])
print("Correlation:", correlation)

In [None]:
### 📌 Step 10: Identify Extreme Weather Conditions

In [None]:
class ExtremeWeather:
    """Row filters that flag extreme readings in a weather DataFrame.

    Every public method returns the rows of the wrapped frame whose value in
    one column lies outside a threshold. The thresholds default to the values
    that used to be hard-coded and can be overridden per call.
    """

    def __init__(self, df):
        # Stored by reference (no copy), so later changes to the frame are seen here.
        self.df = df

    def _outside(self, column, low=None, high=None):
        """Return rows where ``column`` is < ``low`` or > ``high``.

        A bound left as ``None`` is not checked; at least one must be given.
        """
        values = self.df[column]
        if low is None:
            mask = values > high
        elif high is None:
            mask = values < low
        else:
            mask = (values < low) | (values > high)
        return self.df[mask]

    def extreme_temperature(self, low=-20, high=40):
        """Rows with 'Temperature (C)' below ``low`` or above ``high``."""
        return self._outside('Temperature (C)', low=low, high=high)

    def extreme_windspeed(self, threshold=100):
        """Rows with 'Wind Speed (km/h)' above ``threshold``."""
        return self._outside('Wind Speed (km/h)', high=threshold)

    def extreme_humidity(self, low=10, high=95):
        """Rows with 'Humidity' below ``low`` or above ``high``.

        NOTE(review): the defaults 10/95 presume humidity is stored in
        percent. If the column is a 0-1 fraction, every row is below 10 and
        the whole frame is returned; confirm the unit and pass
        ``low=0.10, high=0.95`` in that case.
        """
        return self._outside('Humidity', low=low, high=high)

    def extreme_pressure(self, low=1000, high=1030):
        """Rows with 'Pressure (millibars)' below ``low`` or above ``high``."""
        return self._outside('Pressure (millibars)', low=low, high=high)

    def poor_visibility(self, threshold=1):
        """Rows with 'Visibility (km)' below ``threshold``."""
        return self._outside('Visibility (km)', low=threshold)

# Wrap weather_df and preview the first rows flagged as extreme temperatures.
ext = ExtremeWeather(weather_df)
ext.extreme_temperature().head()

In [None]:
### 📌 Step 11: Temperature Trend (Daily / Weekly / Monthly)

In [None]:
def temp_trend(df, trend='d'):
    """Plot the average temperature resampled per day, week or month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Formatted Date' (anything ``pd.to_datetime`` can parse)
        and 'Temperature (C)'. The frame is not modified.
    trend : str or sequence of str, default 'd'
        'd' for daily, 'w' for weekly, 'm' for monthly means
        (case-insensitive). A list or tuple is still accepted so that calls
        passing the former list default keep working; its first entry is used.

    Raises
    ------
    ValueError
        If ``trend`` is empty or names an unknown granularity.

    Notes
    -----
    Timestamps are converted to UTC before resampling. Monthly means are
    labelled with the first day of the month ('MS').
    """
    resample_rules = {'d': 'D', 'w': 'W', 'm': 'MS'}

    if isinstance(trend, str):
        choice = trend
    else:
        options = list(trend)
        if not options:
            raise ValueError("trend must be one of 'd', 'w' or 'm'")
        choice = options[0]
    choice = choice.lower()
    if choice not in resample_rules:
        raise ValueError(f"Unknown trend {choice!r}; expected 'd', 'w' or 'm'")

    # assign() works on a new frame, so the caller's 'Formatted Date' column keeps its dtype.
    parsed = df.assign(**{'Formatted Date': pd.to_datetime(df['Formatted Date'], utc=True)})
    avg = parsed.resample(resample_rules[choice], on='Formatted Date')['Temperature (C)'].mean()

    plt.figure(figsize=(10,5))
    plt.plot(avg.index, avg.values)
    plt.title("Average Temperature Trend")
    plt.xlabel("Date")
    plt.ylabel("Temperature")
    plt.grid(True)
    plt.show()

In [None]:
### 📌 Step 12: Precipitation Type Distribution (Bar Chart)

In [None]:
# Bar chart of how often each precipitation type occurs.
precip_counts = weather_df['Precip Type'].value_counts()
precip_counts.plot.bar(color='skyblue')
plt.gca().set(title="Distribution of Precipitation Types", xlabel="Type", ylabel="Count")
plt.show()

In [None]:
### 📌 Step 13: Boxplot: Temperature by Season

In [None]:
def get_season(month):
    """Map a month number to a season label.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Summer'; 6-9 -> 'Monsoon';
    any other value -> 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Summer', (3, 4, 5)),
        ('Monsoon', (6, 7, 8, 9)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return 'Autumn'

# Parse the timestamps as UTC, then label every row with the season of its month.
weather_df['Formatted Date'] = pd.to_datetime(weather_df['Formatted Date'], utc=True)
observation_month = weather_df['Formatted Date'].dt.month
weather_df['Season'] = observation_month.map(get_season)

# One box per season showing the spread of temperatures.
sns.boxplot(data=weather_df, x='Season', y='Temperature (C)')
plt.gca().set_title("Temperature Variation per Season")
plt.show()

In [None]:
### 📌 Step 14: Scatter Plot (Humidity vs Temperature)

In [None]:
# Semi-transparent points keep dense regions of the scatter readable.
sns.scatterplot(data=weather_df, x='Temperature (C)', y='Humidity', alpha=0.5)
plt.gca().set_title("Humidity vs Temperature")
plt.grid(True)
plt.show()

In [None]:
### 📌 Step 15: Prepare Data for Temperature Forecasting

In [None]:
# Build one row per calendar day (UTC) holding that day's mean temperature.
df = weather_df.copy()
# utc=True mirrors the parsing used by the other cells of this notebook.
# normalize() drops the time of day, so readings taken on the same day are
# averaged together instead of each distinct timestamp forming its own group.
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], utc=True).dt.normalize()
df = df.groupby('Formatted Date')['Temperature (C)'].mean().reset_index()

# Create numeric feature (days): whole days elapsed since the first date.
df['Day'] = (df['Formatted Date'] - df['Formatted Date'].min()).dt.days

# Single-feature design matrix (kept as a DataFrame) and the target series.
X = df[['Day']]
y = df['Temperature (C)']


In [None]:
### 📌 Step 16: Train Linear Regression Model

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X, y)

print("Model trained successfully!")

In [None]:
### 📌 Step 17: Predict Next 30 Days

In [None]:
# Day offsets for the 30 days following the last observed day, as a column vector.
last_day = df['Day'].max()
future_days = np.arange(last_day+1, last_day+31).reshape(-1,1)

# The model was fitted on a DataFrame with a 'Day' column. Predicting on a bare
# ndarray makes scikit-learn warn about missing feature names, so wrap the
# offsets in a frame with the same column name.
pred_temp = model.predict(pd.DataFrame(future_days, columns=['Day']))

In [None]:
### 📌 Step 18: Plot Temperature Forecast

In [None]:
# Dates for the forecast horizon: 1 to 30 days after the last observed timestamp.
future_dates = df['Formatted Date'].max() + pd.to_timedelta(np.arange(1,31), unit='D')

plt.figure(figsize=(12,5))

# Observed series as a solid line, forecast as a dashed red continuation.
plt.plot(df['Formatted Date'], df['Temperature (C)'], label='Actual Temperature')
plt.plot(future_dates, pred_temp, '--', label='Predicted Temperature', color='red')

plt.gca().set(
    title="Temperature Forecast for Next 30 Days",
    xlabel="Date",
    ylabel="Temperature (C)",
)
plt.legend()
plt.grid(True)
plt.show()
