In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Read the raw suicide statistics from CSV into a DataFrame
df = pd.read_csv('suicide_data.csv')
# Preview the first three records to confirm the file parsed as expected
df.iloc[:3]

In [None]:
# Dimensions of the raw data as a (rows, columns) tuple
(len(df), len(df.columns))

In [None]:
# Column names, non-null counts, dtypes and memory usage of the raw data.
# info is a method and must be called; the bare attribute `df.info` only
# displays the bound-method repr instead of the summary.
df.info()

## Data Cleaning

### Handling missing values. 


In [None]:
# Number of missing entries in each column of the raw data
df.isna().sum()

In [None]:
# Remove the HDI column (described as mostly null; see the null counts above).
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second run no longer raises KeyError.
df = df.drop(columns=['HDI for year'], errors='ignore')
# Drop rows whose suicide count or population is missing.
df = df.dropna(subset=['suicides_no', 'population'])

In [None]:
# Re-count missing entries per column after the cleaning step
df.isna().sum()

### Identifying and dealing with outliers.

In [None]:
# Every int64 / float64 column gets its own box plot
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Grid geometry: fixed number of panels per row, as many rows as needed
n_cols = 3
n_rows = -(-len(numerical_cols) // n_cols)  # ceiling division

# One figure for the whole grid; each row of panels is 5 inches tall
fig = plt.figure(figsize=(15, 5 * n_rows))
for position, column in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.boxplot(data=df, y=column, color='skyblue', ax=ax)
    ax.set_title(f'Box plot of {column}')
    ax.set_ylabel('')  # the title already names the column

# Tidy the spacing between panels, then render
fig.tight_layout()
plt.show()

In [None]:
def iqr_bounds(values, k=1.5):
    """Return the (lower, upper) IQR fences of a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric data to compute the fences for.
    k : float, default 1.5
        Multiple of the interquartile range added beyond each quartile.

    Returns
    -------
    tuple
        ``(Q1 - k * IQR, Q3 + k * IQR)`` where Q1/Q3 are the 25th/75th
        percentiles and IQR = Q3 - Q1.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1 - k * spread, q3 + k * spread


# Fences for the suicide rate and for the raw suicide count
lower_rate, upper_rate = iqr_bounds(df['suicides/100k pop'])
lower, upper = iqr_bounds(df['suicides_no'])

# Filter outliers: keep rows that lie inside the fences for BOTH variables.
within_fences = (
    df['suicides/100k pop'].between(lower_rate, upper_rate)
    & df['suicides_no'].between(lower, upper)
)
# .copy() gives df_clean its own data, so later column assignments neither
# write into a view of df nor raise SettingWithCopyWarning.
df_clean = df[within_fences].copy()

### Data transformation and normalization.


In [None]:
# Simplify age groups by stripping the ' years' suffix, and turn the
# comma-separated GDP strings into real numbers.
#
# * assign() builds a new frame instead of `.loc[:, col] = ...`, which writes
#   into the existing (object-dtype) column and may target a view of df.
# * pd.to_numeric makes gdp_for_year a numeric column; removing the commas
#   alone leaves it as strings, so it would be skipped by numeric selection.
# * astype(str) keeps the cell re-runnable after the column is already numeric.
df_clean = df_clean.assign(
    age=df_clean['age'].str.replace(' years', '', regex=False),
    gdp_for_year=pd.to_numeric(
        df_clean['gdp_for_year'].astype(str).str.replace(',', '', regex=False)
    ),
)



In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalization: rescale every numeric column of df_clean to the [0, 1] range.
# NOTE: df_clean itself is overwritten, so every later cell that reads
# df_clean works with scaled values rather than the original units.
num_cols = df_clean.select_dtypes(include=np.number).columns
scaler = MinMaxScaler()

# Work on an explicit copy and replace the columns wholesale.
# `.loc[:, num_cols] = ...` writes in place, which tries to push floats into
# the existing integer columns (deprecated incompatible-dtype assignment) and
# may write into a view of df.
df_clean = df_clean.copy()
df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])

# Normalized data
df_clean.head(3)

## Exploratory Data Analysis (EDA) 


### Summary statistics. 

In [None]:
# Descriptive statistics of df (quartiles spelled out; these are the defaults)
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Cross-tabulation: mean suicide rate with age groups as rows and sex as columns
pd.crosstab(
    index=df['age'],
    columns=df['sex'],
    values=df['suicides/100k pop'],
    aggfunc='mean',
)

### Data visualization

In [None]:
# One 20-bin histogram per column of the cleaned data that hist() can plot
df_clean.hist(bins=20, figsize=(10, 12))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean suicide rate per country, ordered from highest to lowest
country_rates = (
    df_clean
    .groupby('country')['suicides/100k pop']
    .mean()
    .reset_index()
    .sort_values('suicides/100k pop', ascending=False)
)

# Horizontal bars on a tall canvas so that every country label fits
fig, ax = plt.subplots(figsize=(10, 30))
sns.barplot(
    data=country_rates,
    x='suicides/100k pop',
    y='country',
    color='skyblue',
    edgecolor='black',
    ax=ax,
)

# Titles and axis labels
ax.set_title('Average Suicide Rate by Country (per 100k Population)', fontsize=14)
ax.set_xlabel('Suicides per 100k Population', fontsize=12)
ax.set_ylabel('Country', fontsize=12)
ax.tick_params(axis='x', labelsize=10)
ax.tick_params(axis='y', labelsize=8)  # smaller font for the country names

fig.tight_layout()
plt.show()

In [None]:
# Bar charts of the mean suicide rate for each generation and for each sex.
# Both charts plot 'suicides/100k pop' so the data match their "Suicide Rates"
# titles; the by-sex chart used to plot suicides_no (counts) under a "Rates"
# title. One loop replaces the two copy-pasted plotting stanzas.
rate_charts = [
    ('generation', 'Suicide Rates by Generation'),
    ('sex', 'Suicide Rates by sex'),
]
for group_col, chart_title in rate_charts:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=group_col, y='suicides/100k pop', data=df_clean)
    plt.title(chart_title)
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Scatter plot comparing suicide rate with GDP per capita, coloured by sex.
# alpha=0.7 makes the markers semi-transparent. seaborn's `size` keyword is a
# semantic grouping variable (a column or vector), not a marker-size or
# opacity setting, so passing the scalar 0.7 there did not style the markers.
sns.scatterplot(
    x='suicides/100k pop',
    y='gdp_per_capita ($)',
    data=df_clean,
    hue='sex',
    alpha=0.7,
)

### Identifying patterns and insights. 

In [None]:
# Annotated heatmap of the pairwise correlations between the key variables
corr_cols = ['suicides/100k pop', 'gdp_per_capita ($)', 'population', 'suicides_no', 'year']
sns.heatmap(df_clean[corr_cols].corr(), annot=True)

In [None]:
# Suicide rate over the years, one line per generation, drawn without error bars
sns.lineplot(
    data=df,
    x='year',
    y='suicides/100k pop',
    hue='generation',
    errorbar=None,
)

In [None]:
# Variables used for the correlation and covariance analysis below
numerical_cols = ['suicides/100k pop', 'suicides_no', 'population', 'gdp_per_capita ($)', 'gdp_for_year']
# Coerce every selected column to a numeric dtype. gdp_for_year can still be
# an object (string) column at this point, and corr()/cov()/StandardScaler
# need real numbers; to_numeric raises if a value cannot be parsed rather
# than letting the column be silently dropped or mis-cast downstream.
df_numerical = df_clean[numerical_cols].apply(pd.to_numeric)

In [None]:
# Pearson correlation is DataFrame.corr's default method
# (pass method='spearman' instead for non-linear relationships)
correlation_matrix = df_numerical.corr()
print(correlation_matrix)

In [None]:
# Annotated heatmap of the correlation matrix computed above
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title('Correlation Matrix of Numerical Variables', fontsize=14)
plt.show()

## Covariance

In [None]:
# Covariance
# Pairwise covariance matrix of the columns in df_numerical.
# The result is only stored in cov_matrix; this cell displays nothing.
cov_matrix = df_numerical.cov()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric variables, then compute their covariance matrix
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_numerical)
df_standardized = pd.DataFrame(standardized_values, columns=df_numerical.columns)
cov_matrix_std = df_standardized.cov()

# Annotated heatmap with two-decimal labels on a diverging palette
sns.heatmap(cov_matrix_std, annot=True, cmap='coolwarm', fmt=".2f")

In [None]:
# Mean and median are only defined for numeric columns. Calling them on the
# whole frame (which also holds string columns such as 'sex') raises TypeError
# in pandas >= 2.0, and mode() would feed string values into the bar chart.
# Restrict to numeric columns so all three statistics share the same index.
numeric_clean = df_clean.select_dtypes(include=np.number)

mean_values = numeric_clean.mean()
median_values = numeric_clean.median()
mode_values = numeric_clean.mode().iloc[0]  # Take the first mode (mode can have multiple values)

# Combine the statistics into a single DataFrame for plotting
central_tendency = pd.DataFrame({
    'Mean': mean_values,
    'Median': median_values,
    'Mode': mode_values
})

# Plotting: three side-by-side bars (mean, median, mode) per feature
plt.figure(figsize=(14, 7))
bar_width = 0.25  # Width of individual bars
index = range(len(central_tendency))  # Position of each feature's bar group

# Shift the mean bars left and the mode bars right of the group centre
plt.bar([i - bar_width for i in index], central_tendency['Mean'], bar_width, label='Mean', color='blue')
plt.bar(index, central_tendency['Median'], bar_width, label='Median', color='orange')
plt.bar([i + bar_width for i in index], central_tendency['Mode'], bar_width, label='Mode', color='green')

# Customize the chart
plt.xticks(index, central_tendency.index, rotation=45, ha='right')
plt.title('Central Tendency Measures (Mean, Median, Mode)')
plt.ylabel('Values')
plt.xlabel('Features')
plt.legend()
plt.tight_layout()
plt.show()

### Hypothesis testing

In [None]:
from scipy.stats import ttest_ind

# Suicide-rate samples for each sex
is_male = df_clean['sex'] == 'male'
is_female = df_clean['sex'] == 'female'
male = df_clean.loc[is_male, 'suicides/100k pop']
female = df_clean.loc[is_female, 'suicides/100k pop']

# Independent two-sample t-test; NaN observations are omitted
t_stat, p_value = ttest_ind(male, female, nan_policy='omit')
print(f"T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}")

# Interpret the result at the 5% significance level
verdict = (
    "Reject null hypothesis: Suicide rates differ significantly by gender."
    if p_value < 0.05
    else "No significant difference in suicide rates by gender."
)
print(verdict)