In [115]:
# Importing necessary libraries for the project

# NumPy is used for Linear Algebra operations and numerical computations.
import numpy as np
# pandas is used to work with data in tabular format, making it easier to manipulate and analyze data.
import pandas as pd
# For visualizations, we use Plotly as it creates interesting and interactive visualizations that can be easily customized.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The datetime library is used for Time Series analysis.
from datetime import datetime

# The LinearRegression class from scikit-learn is used for building a linear regression model.
# In this project, linear regression is used to analyze temperature trends over time.
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [116]:
df = pd.read_csv('/content/drive/MyDrive/projects/Weather Data in India from 1901 to 2017.csv')

In [117]:
df.head() ## This will show us top 5 rows of the dataset by default.

Unnamed: 0.1,Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


Now we will Melt data as Melting the data using pd.melt() was necessary to transform the original wide-format dataset into a long-format or tidy format.

In [118]:
df1 = pd.melt(df, id_vars='YEAR', value_vars=df.columns[1:]) ## This will melt the data
df1.head() ## This is how the new data looks now:

Unnamed: 0,YEAR,variable,value
0,1901,JAN,17.99
1,1902,JAN,19.0
2,1903,JAN,18.32
3,1904,JAN,17.77
4,1905,JAN,17.4


In [119]:
print(df1.dtypes)

YEAR          int64
variable     object
value       float64
dtype: object


In [120]:
# Combining the 'variable' (Month) and 'YEAR' columns to create a new 'Date' column.
# This is done to convert the separate 'Month' and 'YEAR' columns into a single datetime representation.
df1['Date'] = df1['variable'] + ' ' + df1['YEAR'].astype(str)

# Converting the 'Date' column from string format to a datetime object.
# This step is essential for performing Time Series analysis, as it allows us to work with date-related functionalities.
df1.loc[:, 'Date'] = df1['Date'].apply(lambda x: datetime.strptime(x, '%b %Y'))

# Displaying the first few rows of the DataFrame after the Date conversion.
df1.head()


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`



Unnamed: 0,YEAR,variable,value,Date
0,1901,JAN,17.99,1901-01-01
1,1902,JAN,19.0,1902-01-01
2,1903,JAN,18.32,1903-01-01
3,1904,JAN,17.77,1904-01-01
4,1905,JAN,17.4,1905-01-01



# Removing NULL Values and Temprature Throught Timeline

In [121]:
df1.columns=['Year', 'Month', 'Temprature', 'Date'] #This line is renaming the columns to 'Year', 'Month', 'Temprature', and 'Date' to make them easier to read.
df1.sort_values(by='Date', inplace=True) #This line is sorting by the 'Date' column in ascending order.
#This is to ensure that the time series is plotted correctly in the next step.
fig = go.Figure(layout = go.Layout(yaxis=dict(range=[0, df1['Temprature'].max()+1])))#This line is creating a new plotly Figure object called "fig" and setting the range of the y-axis to be from 0 to the maximum temperature value of the 'Temperature' column in "df1" plus 1.
fig.add_trace(go.Scatter(x=df1['Date'], y=df1['Temprature']), )#This line is adding a new trace (i.e. line) to the plot "fig" with x-values from the 'Date' column and y-values from the 'Temperature' column of "df1".
fig.update_layout(title='Temprature Throught Timeline:',
                 xaxis_title='Time', yaxis_title='Temprature in Degrees')#This line is updating the layout of "fig" to include a title and x- and y-axis labels.
#The line below is adding a range selector (menu for selecting a range of dates) and range slider (slider for zooming in) to the x-axis of "fig" to allow for easier navigation and exploration of the time series.
fig.update_layout(xaxis=go.layout.XAxis(
    rangeselector=dict(
        buttons=list([dict(label="Whole", step="all"),
                      dict(count=1,label="One Year",step="year",stepmode="todate")
                     ])),
        rangeslider=dict(visible=True),type="date")
)
fig.show()#This line displays the final plot "fig".



# Temperatures across different months

In [122]:
# The 'df1' DataFrame contains the data with columns 'Month' and 'Temprature'.
fig = px.box(df1, 'Month', 'Temprature')

# Update the layout of the plot to add a title indicating the purpose of the visualization.
fig.update_layout(title='Warmest, Coldest and Median Monthly Temperature.')

# Display the box plot.
fig.show()


Insights:
- **Coldest Days** :January has the coldest Days in an Year.
- **Hottest Days**:May has the hottest days in an year, as evident from the box plot.
- **Moderate days**:July is the month with least Standard Daviation, smaller standard deviation indicates that the temperatures in July tend to vary the least, making it a relatively stable and consistent month in terms of temperature. Thus, we can expect most days in July to be warm and fall within a narrower range of temperatures.

# Evaluation on number of clusters

In [123]:
# Import the KMeans class from the scikit-learn library
from sklearn.cluster import KMeans

# Create an empty list to store the Sum of Squared Errors (SSE) for different number of clusters
sse = []

# Extract the 'Temperature' column from the DataFrame and convert it into a 2D numpy array
target = df1['Temprature'].to_numpy().reshape(-1,1)

# Define a range of number of clusters to try
num_clusters = list(range(1, 10))

# Loop over the number of clusters
for k in num_clusters:
    # Create a KMeans object with 'k' clusters
    km = KMeans(n_clusters=k)
    # Fit the KMeans model to the data
    km.fit(target)
    # Append the SSE (inertia_) of the current model to the list
    sse.append(km.inertia_)

# Create a figure using Plotly
fig = go.Figure(data=[
    # Add a line plot for SSE vs. number of clusters
    go.Scatter(x=num_clusters, y=sse, mode='lines'),
    # Add a scatter plot for SSE vs. number of clusters (markers at each data point)
    go.Scatter(x=num_clusters, y=sse, mode='markers')
])

# Update the layout of the figure
fig.update_layout(
    title="Evaluation on number of clusters:",
    xaxis_title="Number of Clusters:",
    yaxis_title="Sum of Squared Distance",
    showlegend=False
)

# Display the figure
fig.show()


A **cluster size of 3** seems a good choice here

In [124]:
# Step 1: Import the necessary libraries
import plotly.express as px
from sklearn.cluster import KMeans

# Step 2: Create a KMeans clustering model with 3 clusters
km = KMeans(3)

# Step 3: Fit the KMeans model to the 'Temprature' column in the DataFrame
# and reshape the data into a 2D array
km.fit(df1['Temprature'].to_numpy().reshape(-1, 1))

# Step 4: Assign the cluster labels to a new column 'Temp Labels' in the DataFrame
df1.loc[:, 'Temp Labels'] = km.labels_

# Step 5: Create a scatter plot using Plotly Express (px)
# with 'Date' on the x-axis, 'Temprature' on the y-axis, and color-coded by 'Temp Labels'
fig = px.scatter(df1, 'Date', 'Temprature', color='Temp Labels')

# Step 6: Update the layout of the plot with a title and axis labels
fig.update_layout(title="Temperature clusters.",
                  xaxis_title="Date", yaxis_title="Temperature")

# Step 7: Display the plot
fig.show()


Insights:
- **Temperature Clusters:** Despite having four distinct seasons, the temperature data can be grouped into three main clusters based on the patterns observed. These clusters likely represent the different temperature zones experienced throughout the year.
- **Coldest Months:**January, February, and December consistently exhibit the coldest temperatures. These months are associated with winter, and the temperature tends to be at its lowest during this time.
- **Hottest Months:**April, May, June, July, August, and September are characterized by the hottest temperatures. These months fall within the summer season, and they consistently experience higher temperatures.
- **Moderate Temperature Months**:March, October, and November are the months with temperatures that fall between the extremes of hot and cold. These months are associated with the transition between seasons, and the temperatures are relatively moderate.

# Frequency chart of temperature readings

In [125]:
# The 'x' parameter specifies the data column for which the histogram will be plotted ('Temprature' column from 'df1')
# The 'nbins' parameter sets the number of bins for the histogram, providing a more detailed distribution of data
# The 'histnorm' parameter is set to 'density' to show the data in terms of probability density, making it easier to compare distributions of different scales

fig = px.histogram(x=df1['Temprature'], nbins=200, histnorm='density')

# Updating the layout of the histogram
# The 'title' parameter sets the title for the plot
# The 'xaxis_title' and 'yaxis_title' parameters set the labels for the x-axis and y-axis, respectively
fig.update_layout(title='Frequency chart of temperature readings:',
                 xaxis_title='Temperature', yaxis_title='Count')


- There is a cluster from 26.2-27.5 and **mean temprature for most months during history has been between 26.8-26.9**.

Let's see if we can get some insights from yearly mean temprature data. I am going to treat this as a time series as well.

# Yearly Mean Temperature and Trendline Over The Years

In [126]:
# Step 1: Calculate the yearly mean temperature by computing the mean across each row of the DataFrame.
df['Yearly Mean'] = df.iloc[:, 1:].mean(axis=1)  # Axis 1 for row-wise mean calculation and axis 0 for columns.

# Step 2: Create a plot using Plotly's Scatter chart with lines and markers to visualize the yearly temperatures.
fig = go.Figure(data=[
    go.Scatter(name='Yearly Temperatures', x=df['YEAR'], y=df['Yearly Mean'], mode='lines'),
    go.Scatter(name='Yearly Temperatures', x=df['YEAR'], y=df['Yearly Mean'], mode='markers')
])

# Step 3: Update the layout of the plot with title and axis labels.
fig.update_layout(title='Yearly Mean Temperature:',
                  xaxis_title='Time', yaxis_title='Temperature in Degrees')

# Step 4: Show the first plot.
fig.show()

# Step 5: Create another scatter plot using Plotly Express with a lowess trendline over the years.
fig = px.scatter(df, x='YEAR', y='Yearly Mean', trendline='lowess')
#The 'trendline' parameter is set to 'lowess'. LOWESS is a non-parametric method that fits a smooth curve to the data by locally averaging nearby points.
#It provides a flexible way to visualize trends in the data without assuming a specific mathematical relationship.
# Step 6: Update the layout of the second plot with title and axis labels.
fig.update_layout(title='Trendline Over The Years:',
                  xaxis_title='Time', yaxis_title='Temperature in Degrees')

# Step 7: Show the second plot.
fig.show()


We can see that the issue of global warning is true.
- **Global Warming Confirmation:** The analysis of yearly mean temperatures reveals a crucial insight confirming the issue of global warming. Prior to 1980, there was no significant upward trend in yearly mean temperatures. However, after 1979, we observe a gradual increase in the yearly mean temperature. This finding aligns with the growing concerns about global warming, as it suggests a recent shift towards rising temperatures.
- **Drastic Temperature Increase:**Notably, the trend becomes more alarming after the year 2015, as the yearly temperatures have increased significantly. This observation highlights the urgent need for climate action to address the accelerating rise in temperatures and its potential consequences.
- **Seasonal Variability Impacting Yearly Temperatures:**The presented figure, though informative, displays a peculiar monthly-like up-down pattern in yearly temperatures. This unexpected pattern may be due to the influence of seasonal variations in the Earth's rotation around the sun. The graph doesn't appear to exhibit a consistent upward trend over the years, indicating that the seasonal variations might play a role in impacting yearly temperature patterns.

# Monthly temperature throughout history

In [127]:
# The x-axis represents the 'Year', the y-axis represents the 'Temprature', and the data is facetted (subplots) by 'Month'.
fig = px.line(df1, 'Year', 'Temprature', facet_col='Month', facet_col_wrap=4)

# Updating the layout of the plot with a title
fig.update_layout(title='Monthly temperature throughout history:')

# Displaying the plot
fig.show()

# Observation: We can see clear positive trendlines in the monthly temperature data over the years.

Let's see if we could find any trend in **seasonal mean tempratures**.

In [128]:
# Step 1: Calculate the mean temperature for Winter months (DEC, JAN, FEB)
df['Winter'] = df[['DEC', 'JAN', 'FEB']].mean(axis=1)

# Step 2: Calculate the mean temperature for Summer months (MAR, APR, MAY)
df['Summer'] = df[['MAR', 'APR', 'MAY']].mean(axis=1)

# Step 3: Calculate the mean temperature for Monsoon months (JUN, JUL, AUG, SEP)
df['Monsoon'] = df[['JUN', 'JUL', 'AUG', 'SEP']].mean(axis=1)

# Step 4: Calculate the mean temperature for Autumn months (OCT, NOV)
df['Autumn'] = df[['OCT', 'NOV']].mean(axis=1)

# Step 5: Create a new DataFrame with only the columns 'YEAR', 'Winter', 'Summer', 'Monsoon', 'Autumn'
seasonal_df = df[['YEAR', 'Winter', 'Summer', 'Monsoon', 'Autumn']]

# Step 6: Reshape the DataFrame using 'melt' function to transform the data from wide to long format
seasonal_df = pd.melt(seasonal_df, id_vars='YEAR', value_vars=seasonal_df.columns[1:])

# Step 7: Rename the columns for better clarity
seasonal_df.columns = ['Year', 'Season', 'Temperature']


# Seasonal mean temperatures throughout the years

In [129]:
# Import the necessary libraries
import plotly.express as px

# Create a scatter plot using the 'seasonal_df' DataFrame
# The x-axis represents the 'Year' column, and the y-axis represents the 'Temperature' column
# The 'facet_col' parameter is used to create separate subplots for each 'Season' value
# The 'facet_col_wrap' parameter specifies that each row should contain a maximum of 2 subplots
# The 'trendline' parameter adds a linear regression trendline to the scatter plot
fig = px.scatter(seasonal_df, 'Year', 'Temperature', facet_col='Season', facet_col_wrap=2, trendline='ols')

# Update the layout of the figure with a title
fig.update_layout(title='Seasonal mean temperatures throughout the years:')

# Display the plot
fig.show()


We can again see a positive trendline between temprature and time. The trendline does not have a very high positive correlation with years but still it is not negligable.The presence of this trendline indicates a gradual, increase or decrease in temperatures over the years for different months.

Let's try to find out if we can get something out of an animation?

In [130]:
px.scatter(df1, 'Month', 'Temprature', size='Temprature', animation_frame='Year')

On first look, we can see some fluctuations but that doesn't give much of insights for us. However, if we again see by arranging bar below to early years and late years we can notice the change. But this is certainly not the best way to visualize it.
Let's find some better way.


# Forecasting

Let's try to forecast monthly mean temprature for year 2018.

In [131]:
# Importing the necessary libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Creating a copy of the DataFrame with only 'Year', 'Month', and 'Temperature' columns
df2 = df1[['Year', 'Month', 'Temprature']].copy()

# Using one-hot encoding to convert categorical features into numerical ones
df2 = pd.get_dummies(df2)
df2.columns = df2.columns.str.replace(' ', '')

# Separating the target variable ('Temperature') and the features ('Year' and 'Month')
y = df2[['Temprature']]
x = df2.drop(columns='Temprature')

# Initializing the DecisionTreeRegressor model
dtr = DecisionTreeRegressor()


# Splitting the data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)

# Training the DecisionTreeRegressor model on the training data
dtr.fit(train_x, train_y)

# Making predictions on the test data
pred = dtr.predict(test_x)

# Calculating the R-squared score to evaluate the model's performance
r2_score(test_y, pred)


0.9617403004317329

In [132]:
# Step 1: Create a new DataFrame for the year 2018
next_Year = df1[df1['Year']==2017][['Year', 'Month']]
next_Year.Year.replace(2017,2018, inplace=True)
next_Year= pd.get_dummies(next_Year)

# Step 2: Use the trained DecisionTreeRegressor model to predict the temperature for 2018
temp_2018 = dtr.predict(next_Year)

# Step 3: Create a new DataFrame for the year 2018
temp_2018 = {'Month':df1['Month'].unique(), 'Temprature':temp_2018}
temp_2018=pd.DataFrame(temp_2018)
temp_2018['Year'] = 2018
temp_2018


Unnamed: 0,Month,Temprature,Year
0,JAN,20.92,2018
1,FEB,23.08,2018
2,MAR,25.58,2018
3,APR,29.56,2018
4,MAY,30.47,2018
5,JUN,29.7,2018
6,JUL,28.18,2018
7,AUG,28.17,2018
8,SEP,28.11,2018
9,OCT,27.24,2018


In [133]:
# Splitting the data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)

# Training the DecisionTreeRegressor model on the training data
dtr.fit(train_x, train_y)

# Making predictions on the test data
pred = dtr.predict(test_x)

# Calculating the R-squared score to evaluate the model's performance
r2_score_value = r2_score(test_y, pred)
r2_score_value

0.9578825880731139

r2 value close to 1 indicates that our predictive model is performing well in explaining the variance in the data. However, it's essential to note that the r2 statistic alone is not sufficient to fully assess the overall goodness of a model, as there are other evaluation metrics to consider But, that we'll discuss later.

Now, Let's see the forecasted data for 2018.


In [134]:
model= DecisionTreeRegressor(random_state=0)
model.fit(train_x,train_y)

In [135]:
# df1 contains the forecasted temperature data
forecasted_temp = df1.groupby('Year')['Temprature'].mean().reset_index()
fig = go.Figure(data=[
    go.Scatter(name='Yearly Mean Temprature', x=forecasted_temp['Year'], y=forecasted_temp['Temprature'], mode='lines'),
    go.Scatter(name='Yearly Mean Temprature', x=forecasted_temp['Year'], y=forecasted_temp['Temprature'], mode='markers')
])
fig.update_layout(title='Forecasted Temperature:',
                 xaxis_title='Time', yaxis_title='Temperature in Degrees')
fig.show()


In [136]:
# Reshape test_y to be a column vector
test_y = test_y.values.reshape(-1, 1)

# Generate the scatter plot
fig = px.scatter(x=test_y.flatten(), y=pred.flatten(), labels={'x':'Actual Values','y':'Predicted Values'})
fig.show()

In [137]:
# Create a subset of data for the year 2017 containing 'Year' and 'Month' columns
dummy_data_2017 = df1[df1['Year'] == 2017][['Year', 'Month']]

# Convert categorical data to numerical using one-hot encoding
dummy_data_2017 = pd.get_dummies(dummy_data_2017)

# Define a function to forecast temperatures for a given year using the provided model and data
def forecasted_temp(Year, model, dummy_data):
    # Create a copy of the dummy data to avoid modifying the original data
    dummy_data_year = dummy_data.copy()
    # Replace the 'Year' value with the input year to predict temperatures for that year
    dummy_data_year['Year'].replace(2017, Year, inplace=True)
    # Predict temperatures using the model for the specified year
    temp_of_year = model.predict(dummy_data_year)
    # Return the mean, maximum, and minimum forecasted temperatures for the year
    return temp_of_year.mean(), temp_of_year.max(), temp_of_year.min()


In [138]:
# Forecast temperature for 2018
mean_temp_2018, min_temp_2018, max_temp_2018 = forecasted_temp(2018, model, dummy_data_2017)
print("Mean Temperature for 2018:", mean_temp_2018)
print("Max Temperature for 2018:", max_temp_2018)
print("Min Temperature for 2018:", min_temp_2018)

Mean Temperature for 2018: 26.198333333333334
Max Temperature for 2018: 20.92
Min Temperature for 2018: 30.47


# Give mean,max and min temperature of the Year entered by the User!!

In [141]:
# Step 1: Ask the user to input the year they want to forecast
user_input_year = int(input("Enter the year you want to forecast: "))
# Step 2: Forecast temperature for the user input year
mean_temp_user_input, max_temp_user_input, min_temp_user_input = forecasted_temp(user_input_year, model, dummy_data)
mean_temp_2018, min_temp_2018, max_temp_2018 = forecasted_temp(2018, model, dummy_data_2017)

# Step 3: Display the forecasted temperatures for the user input year
print("Forecasted Mean Temperature for", user_input_year, ":", mean_temp_user_input)
print("Forecasted Max Temperature for", user_input_year, ":", max_temp_user_input)
print("Forecasted Min Temperature for", user_input_year, ":", min_temp_user_input)

# Step 4: Forecast temperature for the previous year
mean_temp_previous_year, max_temp_previous_year, min_temp_previous_year = forecasted_temp(user_input_year - 1, model,dummy_data)


Enter the year you want to forecast: 2011
Forecasted Mean Temperature for 2011 : 24.990833333333327
Forecasted Max Temperature for 2011 : 28.92
Forecasted Min Temperature for 2011 : 18.32
