In [None]:
"""
pip install pandas 
pip install scipy
pip install matplotlib
pip install ipython
"""

In [None]:
from scipy.signal import periodogram
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import numpy as np

In [None]:
# Load the IMD sub-division rainfall CSV into the DataFrame `data`.
data = pd.read_csv(filepath_or_buffer='./Sub_Division_IMD_2017.csv')
# Render the freshly loaded frame so its columns can be inspected.
display(data)

In [None]:
# Data Exploration:
# After inspecting the data, I see that it covers 117 years of monthly rainfall for a selected set of states in India.
# The data ranges from 1901 to 2017. With more than 100 years of data, we can analyze various aspects.

# Analytical Goals:
# 1. Long-term trend
# 2. Seasonal Pattern
# 3. Months with max-min rainfall and variability
# 4. Extreme events in our centurial data
# 5. State-wise analysis
# 6. Trend-line (direction and pattern of my data)
# 7. Cyclic patterns
# 8. Comparing two different months of the same and different years
# 9. Time-series of the state with max-min rainfall and understanding the pattern for the same
# 10. Spatial Analysis
# 11. Forecast the future rainfall patterns
# 12. Grouping the data into groups of 15-20 years and analyzing the variability

In [None]:
import pandas as pd

# Drop incomplete rows before spectral analysis.
df = data.dropna()

# Compute the periodogram of the ANNUAL column (sampling rate: one sample per row).
# NOTE(review): `df` still contains every sub-division stacked one after another, so
# this is not a single continuous yearly series — confirm whether a per-state series
# was intended.
freq, power = periodogram(df['ANNUAL'], fs=1, scaling='spectrum', nfft=1024)

# The first returned frequency is 0 (the DC bin); 1 / 0 raises a divide-by-zero
# RuntimeWarning and yields an infinite period, so only positive frequencies are plotted.
positive = freq > 0

# Plot power against period (the reciprocal of frequency).
plt.plot(1 / freq[positive], power[positive])
plt.xlabel('Period (Years)')
# scaling='spectrum' returns a power spectrum, not a density.
plt.ylabel('Power Spectrum')
plt.title('Periodogram of Annual Rainfall')
plt.show()

In [None]:
# Drop incomplete rows before spectral analysis.
df = data.dropna()

# Compute the periodogram of the ANNUAL column (sampling rate: one sample per row).
freq, power = periodogram(df['ANNUAL'], fs=1, scaling='spectrum', nfft=1024)

# Guard against division by zero.
# The offending bin is frequency 0 (the first element), not 1: with fs=1 the
# one-sided frequencies never reach 1, so the previous `== 1` test never matched.
# Replacing 0 with NaN makes 1 / freq evaluate to NaN without a warning, and
# matplotlib simply skips NaN points.
non_zero_freq = freq.copy()
non_zero_freq[non_zero_freq == 0] = np.nan

# Plot power against period (the reciprocal of frequency).
plt.plot(1 / non_zero_freq, power)
plt.xlabel('Period (Years)')
# scaling='spectrum' returns a power spectrum, not a density.
plt.ylabel('Power Spectrum')
plt.title('Periodogram of Annual Rainfall')
plt.show()

# The divide-by-zero warning came from the zero-frequency bin; masking it resolves it.

In [None]:
# Keep every column of `data` except the last four and store the result in `data1`.
# NOTE(review): per the original note the dropped columns are the seasonal
# aggregates (JF, MAM, JJA/JJAS, OND) — confirm against the CSV header.
data1 = data.iloc[:, :-4]

In [None]:
# Select the Uttarakhand rows by matching the CITY column directly, instead of
# comparing every cell of the frame against the string.
uttarakhand_mask = data1['CITY'] == 'Uttarakhand'
Uttarakhand = data1[uttarakhand_mask]
annual = Uttarakhand['ANNUAL']

# Statistical results.
# All reductions use pandas methods so missing values are skipped consistently
# (np.median / np.percentile would return NaN if a single value were missing).
mean_value = annual.mean()
median_value = annual.median()

# ddof=0 keeps the population definition that np.std / np.var use by default.
std_dev = annual.std(ddof=0)
variance = annual.var(ddof=0)

skewness = annual.skew()
kurtosis = annual.kurt()

minimum = annual.min()
maximum = annual.max()

# Quartiles and inter-quartile range (linear interpolation, as in np.percentile).
q1 = annual.quantile(0.25)
q3 = annual.quantile(0.75)
iqr = q3 - q1

# Print in the same order as before: mean, median, std, variance, skewness,
# kurtosis, minimum, maximum, q1, q3, iqr.
for statistic in (mean_value, median_value, std_dev, variance, skewness,
                  kurtosis, minimum, maximum, q1, q3, iqr):
    print(statistic)

In [None]:
# Draw one box-and-whisker per numeric column of data1 on a single set of axes.
boxplot_axes = data1.boxplot(figsize=(12, 8), rot=45)
boxplot_axes.set_title('Box Plot for Each Column')
plt.show()


# A box plot (box-and-whisker plot) summarises the distribution of each column:
# its spread, skewness and central tendency.
# Points drawn beyond the whiskers are the outliers visible in this plot.

In [None]:

# Outlier rejection

# Rows are flagged when the absolute Z-score of any numeric column exceeds this value.
threshold = 3

# Compute Z-scores for every numeric column at once instead of concatenating
# per-column results in a loop (quadratic, and concatenating onto an empty frame
# warns on recent pandas).
numeric_columns = data1.select_dtypes(include=np.number)
z_scores = ((numeric_columns - numeric_columns.mean()) / numeric_columns.std()).abs()

# A row is an outlier if at least one of its columns is beyond the threshold.
# Comparisons with NaN are False, so rows with missing values are not flagged for them.
outlier_mask = (z_scores > threshold).any(axis=1)

# Each flagged row appears exactly once, so no value-based de-duplication is needed.
outliers_df = data1[outlier_mask].copy()

# Remove outliers from the original DataFrame; .copy() yields an independent frame
# that later cells can modify safely.
df_cleaned = data1[~outlier_mask].copy()

# Display information about removed outliers
print(f'Number of outliers removed: {len(outliers_df)}')
print('Outliers:')
print(outliers_df)

In [None]:
# Repeat the per-column box plot, this time on the data left after outlier removal.
cleaned_axes = df_cleaned.boxplot(figsize=(12, 8), rot=45)
cleaned_axes.set_title('Box Plot for Each Column')
plt.show()

# Author's observation: the box plot of the cleaned data no longer shows outliers,
# so the cleaned data is used for the further analysis.

In [None]:
# Seasonal analysis
#
# The CSV is read again because this analysis uses its last four (seasonal) columns,
# which are plotted against the year.
df = pd.read_csv("./Sub_Division_IMD_2017.csv").dropna()

# With more than 20 states in the file, a single state is picked to see which
# seasons carry the most/least rainfall and how variable they are.
# JF   - January and February
# MAM  - March, April and May
# JJAS - June, July, August and September
# OND  - October, November and December
is_uttarakhand = (df == 'Uttarakhand').any(axis=1)
seasons = df[is_uttarakhand]

# Keep only the first two columns plus the last four; everything else is dropped.
seasons = seasons.iloc[:, [0, 1, -4, -3, -2, -1]]
seasons.plot(x='YEAR', figsize=(30, 10), linestyle='--', marker='.')

# Author's reading of the graph: June, July, August and September are the months
# with maximum rainfall, i.e. JJAS is the rainy season in India (Uttarakhand).
# The same check can be repeated for every other state.

In [None]:
# Seasonal patterns
"""
To find seasonal patterns in a rainfall dataset spanning 100 years, I am using time-series
seasonal decomposition, which separates the data into its underlying components:
trend, seasonal, and residual.
Trend helps us look for long-term patterns in the data: is it increasing, decreasing, or relatively stable over time?
Seasonal helps us identify repeating patterns that occur at regular intervals; seasonality shows up as regular peaks and troughs.
Residual captures any remaining patterns or irregularities in the data.
"""


from statsmodels.tsa.seasonal import seasonal_decompose

# Restrict the outlier-free data to the Uttarakhand rows. The explicit .copy()
# produces an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
uttarakhand_rows = (df_cleaned == 'Uttarakhand').any(axis=1)
Uttarakhand_cleaned = df_cleaned[uttarakhand_rows].copy()
print(Uttarakhand_cleaned)

# Split ANNUAL into trend, seasonal and residual components.
# NOTE(review): ANNUAL holds one value per row/year, so period=12 searches for a
# 12-row cycle rather than a 12-month one — confirm this is the intended period.
decomposition = seasonal_decompose(Uttarakhand_cleaned['ANNUAL'], model = 'multiplicative', period = 12)

# Plot against YEAR; the frame's index is just the row label carried over from the
# CSV and is not meaningful as a time axis.
years = Uttarakhand_cleaned['YEAR']
panels = [
    (decomposition.trend, 'Trend', 'blue'),
    (decomposition.seasonal, 'seasonal', 'green'),
    (decomposition.resid, 'residual', 'red'),
    (Uttarakhand_cleaned['ANNUAL'], 'original', 'black'),
]

plt.figure(figsize = (20,8))
for position, (component, label, colour) in enumerate(panels, start=1):
    plt.subplot(len(panels), 1, position)
    plt.plot(years, component, label=label, color=colour)
    plt.legend()
plt.xlabel('Year')
plt.show()

In [None]:
"""
Running average (moving average)
Smoothing the data using a running/moving average to reduce short-term variability and highlight long-term trends.
SMA and EMA
"""

# Work on an independent copy so that adding the smoothed columns below does not
# raise SettingWithCopyWarning when the frame originated as a slice.
Uttarakhand_cleaned = Uttarakhand_cleaned.copy()

# Number of rows in the smoothing window. It must be defined before its first use;
# previously it was assigned only after the rolling mean, which raised NameError
# on a fresh kernel.
window_size = 10

# Simple moving average: unweighted mean over the last `window_size` rows.
Uttarakhand_cleaned['Simple Moving Average'] = Uttarakhand_cleaned['ANNUAL'].rolling(window = window_size).mean()

# Exponential moving average: recent rows weigh more; span controls the decay.
Uttarakhand_cleaned['Exponentially Moving Average'] = Uttarakhand_cleaned['ANNUAL'].ewm(span = window_size , adjust = False ).mean()

# Draw the original series once, with both smoothed versions on top. Labels are
# derived from window_size so they cannot drift from the window actually used.
plt.plot(Uttarakhand_cleaned.index , Uttarakhand_cleaned['ANNUAL'], label = 'Original', color = 'blue')
plt.plot(Uttarakhand_cleaned.index , Uttarakhand_cleaned['Simple Moving Average'],
         label = f'Simple moving average (window={window_size})', color = 'red')
plt.plot(Uttarakhand_cleaned.index , Uttarakhand_cleaned['Exponentially Moving Average'],
         label = f'Exponential moving average (span={window_size})', color = 'green')
plt.legend()
plt.show()

In [None]:
# Trend line
# A straight line fitted through the data shows its overall direction.

x = Uttarakhand_cleaned['YEAR']
y = Uttarakhand_cleaned['ANNUAL']

# Least-squares fit of a degree-1 polynomial: coefficients are (slope, intercept).
coefficients = np.polyfit(x, y, deg=1)
polyfit_line = np.poly1d(coefficients)

# Evaluate the fitted line at every observed year.
fit_y = np.polyval(coefficients, x)

trend_figure, trend_axes = plt.subplots(figsize=(25, 6))
trend_axes.scatter(x, y, label='Data Points')
trend_axes.plot(x, fit_y, color='red')
plt.show()

In [None]:
# Keep the first 15 columns of the outlier-free data for the later merge.
# .copy() detaches the result from df_cleaned: later cells assign to its CITY column
# in place, which on a plain slice raises SettingWithCopyWarning.
df_merging = df_cleaned.iloc[:, :15].copy()
df_merging

# saving the cleaned data for further analysis.

In [None]:
# Reshape to long format: keep the first 14 columns, index by year and state, stack
# the remaining columns into rows, then flatten the index back into columns.
df_cleaned = (
    df_cleaned.iloc[:, :14]
    .set_index(['YEAR', 'CITY'])
    .stack()
    .reset_index()
)

# After stacking, the third and fourth columns have auto-generated names; map them
# to descriptive ones.
stacked_columns = df_cleaned.columns.values
mapper = {
    stacked_columns[2]: 'MONTHS',
    stacked_columns[3]: 'RAINFALL IN MM',
}

# Apply the new column names.
df_cleaned = df_cleaned.rename(columns=mapper)

# Show the long-format frame.
display(df_cleaned)

In [None]:

# 1. Temporal Analysis - Time Series of Yearly Rainfall
# Total of all rainfall values recorded for each year.
yearly_rainfall = df_cleaned.groupby('YEAR')['RAINFALL IN MM'].sum()
yearly_rainfall.plot(kind='line', marker='o', title='Yearly Rainfall Trend')
plt.xlabel('Year')
plt.ylabel('Total Rainfall (mm)')
plt.show()

# Order in which month labels first appear in the long-format frame. groupby sorts
# its keys alphabetically, which would scramble the months on the x-axis, so this
# order is re-applied below.
# NOTE(review): assumes first-appearance order is calendar order — confirm.
month_order = df_cleaned['MONTHS'].unique()

# 2. Seasonal Patterns - Average Rainfall for Each Month
monthly_average = df_cleaned.groupby('MONTHS')['RAINFALL IN MM'].mean().reindex(month_order)
monthly_average.plot(kind='bar', title='Average Monthly Rainfall')
plt.xlabel('Month')
plt.ylabel('Average Rainfall (mm)')
plt.show()

# 3. Monthly Distribution - Box Plot
# One box per month, in the same order as the bar chart.
rainfall_by_month = [
    df_cleaned.loc[df_cleaned['MONTHS'] == month, 'RAINFALL IN MM']
    for month in month_order
]
plt.figure(figsize=(8, 6))
plt.boxplot(rainfall_by_month)
# Boxes are drawn at positions 1..N; labelling the ticks explicitly avoids the
# boxplot `labels` keyword, which is deprecated in recent Matplotlib releases.
plt.xticks(range(1, len(month_order) + 1), month_order)
plt.title('Monthly Rainfall Distribution')
plt.xlabel('Month')
plt.ylabel('Rainfall (mm)')
plt.show()

In [None]:
# Since the data spans 117 years, the years are split into groups for comparison.
# Groups of 15 years are built and compared below.

# Re-read the raw CSV into `df` (no rows are dropped here, unlike earlier cells).
df = pd.read_csv("./Sub_Division_IMD_2017.csv")


def create_year_groups(dataframe, group_size):
    """Split a frame into consecutive windows of ``group_size`` years.

    Windows start at the smallest value of the ``YEAR`` column and advance by
    ``group_size``; the final window may cover fewer years than the others.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a numeric ``YEAR`` column.
    group_size : int
        Number of years per window (passed to ``range`` as the step).

    Returns
    -------
    list of pandas.DataFrame
        One sub-frame per window, in chronological order. Returns an empty
        list when ``YEAR`` has no non-missing values.
    """
    # Missing years cannot anchor a window; ignoring them also keeps min()/max()
    # from returning NaN on an empty frame.
    years = dataframe['YEAR'].dropna()
    if years.empty:
        return []

    # range() needs integers; YEAR is float when the column contains NaN.
    first_year = int(years.min())
    last_year = int(years.max())

    groups = []
    for start_year in range(first_year, last_year + 1, group_size):
        end_year = start_year + group_size - 1
        # between() is inclusive on both ends; rows with a missing YEAR never match.
        in_window = dataframe['YEAR'].between(start_year, end_year)
        groups.append(dataframe[in_window])
    return groups


# Split the full record into consecutive 15-year windows.
groups_of_15_years = create_year_groups(df, 15)


# For each window, report the year of the single row with the highest ANNUAL value.
for i, group in enumerate(groups_of_15_years):
    annual = group['ANNUAL'].dropna()
    if annual.empty:
        # idxmax cannot be computed for a window without ANNUAL data; skip it.
        continue
    max_rainfall_year = group.loc[annual.idxmax(), 'YEAR']
    print(f"Group {i + 1}: Maximum Rainfall Year - {max_rainfall_year}")

# Maximum rainfall states
# Row of the wettest year for every CITY; states without any ANNUAL value are skipped.
wettest_rows = df.dropna(subset=['ANNUAL']).groupby('CITY')['ANNUAL'].idxmax()
max_rainfall_states_overall = df.loc[wettest_rows, ['CITY', 'YEAR', 'ANNUAL']]
print("\nMaximum Rainfall States Overall:")
print(max_rainfall_states_overall)

# Minimum rainfall
# Compare windows by mean ANNUAL rather than by sum: the last window covers fewer
# years than the others, so its sum would be smallest regardless of rainfall.
groups_with_data = [g for g in groups_of_15_years if g['ANNUAL'].notna().any()]
min_rainfall_group = min(groups_with_data, key=lambda x: x['ANNUAL'].mean())
print("\nMinimum Rainfall Among the Groups:")
print(min_rainfall_group[['YEAR', 'ANNUAL']])

In [None]:
# Spatial Analysis

def _clean_city_column(frame):
    """Return a copy of ``frame`` with a normalised CITY column.

    Rows without a CITY are dropped; the remaining names are converted to str,
    stripped of surrounding whitespace and cleared of punctuation (anything that
    is neither a word character nor whitespace).
    """
    cleaned = frame.dropna(subset=['CITY']).copy()
    cleaned['CITY'] = (
        cleaned['CITY']
        .astype(str)
        .str.strip()
        # regex=True is required: recent pandas treats the pattern as a literal by
        # default, which made this replacement a silent no-op. The raw string
        # avoids invalid-escape warnings for \w and \s.
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    return cleaned


main_data = df_merging
# Loading the dataset containing latitude and longitude
# (second dataset; both datasets are merged later for the spatial analysis).
lat_lon_data = pd.read_csv('./poptable.csv')

# Check the columns in each DataFrame
print("Columns in main_data:", main_data.columns)
print("Columns in lat_lon_data:", lat_lon_data.columns)

# Normalise the join key on both sides. The frames are rebuilt instead of mutated
# in place, so no SettingWithCopyWarning is raised; df_merging and main_data keep
# referring to the same cleaned frame, as before.
lat_lon_data = _clean_city_column(lat_lon_data)
df_merging = main_data = _clean_city_column(df_merging)

In [None]:
# Upper-case the CITY join key (only this column is converted).
# Both frames are converted so the later merge on CITY compares like with like;
# upper-casing names that are already upper-case changes nothing.
df_merging = df_merging.assign(CITY=df_merging['CITY'].str.upper())
lat_lon_data = lat_lon_data.assign(CITY=lat_lon_data['CITY'].str.upper())

# Display the updated DataFrame
print(df_merging)

In [None]:
# Attach coordinates to every rainfall row: left join on CITY, so rainfall rows
# without a coordinate match are kept with missing LATITUDE/LONGITUDE.
coordinates = lat_lon_data[['CITY', 'LATITUDE', 'LONGITUDE']]
merged_data = df_merging.merge(coordinates, how='left', on='CITY')

# Show the merged frame.
merged_data

In [None]:
# Discard every row that has a missing value in any column.
merged_data = merged_data.dropna(axis=0, how='any')
# Show what is left of the merged dataset.
merged_data

In [None]:
import geopandas as gpd

# Build point geometries from the coordinate columns and wrap the merged data in a
# GeoDataFrame.
point_geometry = gpd.points_from_xy(x=merged_data["LONGITUDE"], y=merged_data["LATITUDE"])
gdf = gpd.GeoDataFrame(merged_data, geometry=point_geometry)

# Plot the points coloured by ANNUAL rainfall (a point map, with a colour legend).
rainfall_axes = gdf.plot(column="ANNUAL", cmap="viridis", legend=True)
rainfall_axes.set_title("Spatial Distribution of Annual Rainfall")
plt.show()

In [None]:
import pandas as pd
from geopy.geocoders import Nominatim


from geopy.extra.rate_limiter import RateLimiter

# Remove exact duplicate rows and rows with missing values before geocoding.
merged_data = merged_data.drop_duplicates().dropna()

# Create a geolocator; the rate limiter spaces requests at least one second apart
# so the public Nominatim service is not flooded.
geolocator = Nominatim(user_agent="state_geocoding")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Geocode each distinct CITY once instead of once per row: the same name repeats
# for every year, so a per-row lookup sends many identical requests.
city_coordinates = {}
for city in merged_data['CITY'].unique():
    location = geocode(city)
    # geocode returns None when nothing is found; record NaN in that case.
    city_coordinates[city] = (
        (location.latitude, location.longitude) if location else (np.nan, np.nan)
    )

# Overwrite LATITUDE / LONGITUDE with the geocoded values. No temporary
# 'location' column is created, so nothing has to be dropped afterwards.
merged_data = merged_data.assign(
    LATITUDE=merged_data['CITY'].map(lambda city: city_coordinates[city][0]),
    LONGITUDE=merged_data['CITY'].map(lambda city: city_coordinates[city][1]),
)

# `merged_data_unique_city` was referenced before but never defined (NameError).
# NOTE(review): defined here as one row per CITY, inferred from the name — confirm.
merged_data_unique_city = merged_data.drop_duplicates(subset='CITY')

In [None]:

# Scatter plot comparing rainfall in two chosen month columns of `data`.
month1, month2 = "JAN", "JUL"
plt.scatter(data[month1], data[month2])
plt.xlabel(month1)
plt.ylabel(month2)
plt.title(f"Correlation between {month1} and {month2}")
plt.show()