In [1]:
pip install pandas 
pip install scipy
pip install matplotlib
pip install ipython

Note: you may need to restart the kernel to use updated packages.


In [None]:
from scipy.signal import periodogram
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import numpy as np

In [None]:
# Load the rainfall CSV into the DataFrame `data` and show it.
rainfall_csv = '/content/drive/MyDrive/Colab Notebooks/Sub_Division_IMD_2017.csv'
data = pd.read_csv(rainfall_csv)
display(data)    # render the DataFrame that was just read

In [None]:
# Data Exploration:
# After inspecting the data, I see that it covers 117 years of monthly rainfall for a selected set of states in India.
# The data ranges from 1901 to 2017. With more than 100 years of data, we can analyze various aspects.

# Analytical Goals:
# 1. Long-term trend
# 2. Seasonal Pattern
# 3. Months with max-min rainfall and variability
# 4. Extreme events in our century-long data
# 5. State-wise analysis
# 6. Trend-line (direction and pattern of my data)
# 7. Cyclic patterns
# 8. Comparing two different months of the same and different years
# 9. Time-series of the state with max-min rainfall and understanding the pattern for the same
# 10. Spatial Analysis
# 11. Forecast the future rainfall patterns
# 12. Grouping the data into groups of 15-20 years and analyzing the variability

In [None]:
# Drop the four trailing seasonal columns (JF, MAM, JJAS, OND) and keep the rest.
seasonal_col_count = 4
data1 = data.iloc[:, :-seasonal_col_count]

In [None]:
# Descriptive statistics of the 'ANNUAL' column for the Uttarakhand rows.

# Keep every row in which any cell equals 'Uttarakhand'. The boolean mask has its
# own name so that `Uttarakhand` only ever refers to the filtered DataFrame.
uttarakhand_mask = (data1 == 'Uttarakhand').any(axis=1)
Uttarakhand = data1[uttarakhand_mask]

# Drop missing values once, up front, so that every statistic below is computed
# on exactly the same observations (mixing NumPy and pandas reducers would
# otherwise skip NaN for some statistics and propagate it for others).
annual = Uttarakhand['ANNUAL'].dropna()

# Central tendency
mean_value = annual.mean()
median_value = annual.median()

# Spread: ddof=0 gives the population standard deviation / variance.
std_dev = annual.std(ddof=0)
variance = annual.var(ddof=0)

# Shape of the distribution
skewness = annual.skew()
kurtosis = annual.kurt()

# Range
minimum = annual.min()
maximum = annual.max()

# Quartiles and inter-quartile range
q1 = annual.quantile(0.25)
q3 = annual.quantile(0.75)
iqr = q3 - q1

# Label every value so the output can be read without consulting the code.
print(f'Mean: {mean_value}')
print(f'Median: {median_value}')
print(f'Standard deviation: {std_dev}')
print(f'Variance: {variance}')
print(f'Skewness: {skewness}')
print(f'Kurtosis: {kurtosis}')
print(f'Minimum: {minimum}')
print(f'Maximum: {maximum}')
print(f'Q1 (25th percentile): {q1}')
print(f'Q3 (75th percentile): {q3}')
print(f'IQR: {iqr}')

In [None]:
# Draw one box plot per column of `data1` on a single set of axes.
boxplot_ax = data1.boxplot(rot=45, figsize=(12, 8))
boxplot_ax.set_title('Box Plot for Each Column')
plt.show()


# A box plot (box-and-whisker plot) visualizes the distribution of the data:
# its spread, skewness and central tendency.
# Outliers are visible in this box plot.

In [None]:

# Outlier rejection using Z-scores.

# A value is treated as an outlier when it lies more than `threshold` standard
# deviations away from the mean of its column.
threshold = 3

# Absolute Z-scores of all numeric columns, computed in one vectorised step
# instead of growing a DataFrame with pd.concat inside a loop.
numeric_columns = data1.select_dtypes(include=np.number)
z_scores = ((numeric_columns - numeric_columns.mean()) / numeric_columns.std()).abs()

# A row is an outlier if at least one of its numeric values exceeds the threshold.
# Missing values yield NaN Z-scores, which compare as False and never flag a row.
outlier_mask = (z_scores > threshold).any(axis=1)

# Split the data with the same mask, so each row lands in exactly one of the two
# frames and no de-duplication is needed.
outliers_df = data1[outlier_mask]
df_cleaned = data1[~outlier_mask]

# Display information about removed outliers
print(f'Number of outliers removed: {len(outliers_df)}')
print('Outliers:')
print(outliers_df)

In [None]:
# Draw one box plot per column of `df_cleaned` on a single set of axes.
cleaned_boxplot_ax = df_cleaned.boxplot(rot=45, figsize=(12, 8))
cleaned_boxplot_ax.set_title('Box Plot for Each Column')
plt.show()

In [None]:
# Seasonal analysis

# Read the dataset again and discard rows with missing values; this analysis
# uses the last 4 columns of the set, plotted against time.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Sub_Division_IMD_2017.csv").dropna()

# There are more than 20 states, so a single state is chosen to see which
# seasons have the max/min rainfall and variability.
# JF   - January and February
# MAM  - March, April and May
# JJAS - June, July, August and September
# OND  - October, November and December
is_uttarakhand = (df == 'Uttarakhand').any(axis=1)
seasons = df[is_uttarakhand]

# Keep only the first two columns and the four trailing seasonal columns.
leading_cols = seasons.iloc[:, :2]
seasonal_cols = seasons.iloc[:, -4:]
seasons = leading_cols.join(seasonal_cols)
seasons.plot(x='YEAR', figsize=(30, 10), linestyle='--', marker='.')

# The graph shows that June, July, August and September are the months with the most rainfall,
# so we can conclude that JJAS is the rainy season in India (Uttarakhand).
# The same check can be repeated for every other state.

In [None]:
# Seasonal patterns
#
# To look for seasonal patterns in a rainfall dataset spanning 100+ years, time-series
# seasonal decomposition is used. It separates the data into three components:
#   * Trend    - long-term movement: is the series increasing, decreasing or stable?
#   * Seasonal - patterns repeating at a regular interval, visible as peaks and troughs.
#   * Residual - whatever pattern or irregularity remains after removing the other two.

from statsmodels.tsa.seasonal import seasonal_decompose

# Uttarakhand rows of the outlier-free data, printed for inspection.
Uttarakhand_cleaned = df_cleaned[(df_cleaned == 'Uttarakhand').any(axis=1)]
print(Uttarakhand_cleaned)

# NOTE(review): the decomposition below runs on the unfiltered `Uttarakhand` rows, not
# on `Uttarakhand_cleaned` computed above - confirm which of the two is intended.
# NOTE(review): the 'ANNUAL' column presumably holds one value per year, in which case
# period=12 describes a 12-year cycle rather than a within-year season - TODO confirm.
# Index by YEAR so that every component is plotted against the year, not the row label.
annual_by_year = Uttarakhand.set_index('YEAR')['ANNUAL']
decomposition = seasonal_decompose(annual_by_year, model='multiplicative', period=12)

# One stacked panel per component, sharing the x-axis so the years line up.
panels = [
    (decomposition.trend, 'Trend', 'blue'),
    (decomposition.seasonal, 'seasonal', 'green'),
    (decomposition.resid, 'residual', 'red'),
    (annual_by_year, 'original', 'black'),
]
fig, axes = plt.subplots(len(panels), 1, figsize=(20, 8), sharex=True)
for ax, (series, label, color) in zip(axes, panels):
    ax.plot(series.index, series, label=label, color=color)
    ax.legend()
axes[-1].set_xlabel('YEAR')
plt.show()