# In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (already renamed)
df = pd.read_csv("Finance_data.csv")

# -------------------------------
# Step 1: Streamline / Clean Data
# -------------------------------


def _merge_labels(frame, columns, sep=" | "):
    """Combine several free-text columns into one multi-label column.

    For every row the values of *columns* are visited left to right. Missing
    and blank values are skipped, and a label already collected for that row
    is not added a second time. The surviving labels are joined with *sep*.

    Args:
        frame: DataFrame holding the source columns.
        columns: Column names to merge, in output order.
        sep: Separator placed between labels.

    Returns:
        An object-dtype Series aligned with ``frame.index``; rows without any
        usable label hold ``pd.NA``.
    """
    def _row_labels(values):
        labels = []
        for value in values:
            if pd.isna(value):
                continue
            # str() keeps non-string cells from breaking the join; stray
            # separator characters and spaces at the edges are trimmed.
            label = str(value).strip(" |")
            if label and label not in labels:
                labels.append(label)
        return sep.join(labels) if labels else pd.NA

    rows = frame[columns].itertuples(index=False, name=None)
    return pd.Series([_row_labels(row) for row in rows], index=frame.index, dtype="object")


# Fold Objective, Purpose and the savings-objective question into one Goal
# field. Identical answers within a row are kept once so that later
# frequency counts do not count the same goal twice for one respondent.
df['Goal'] = _merge_labels(df, ['Objective', 'Purpose', 'What are your savings objectives?'])

# Consolidate "Reasons" into one multi-label field. Blank columns in the
# middle no longer leave empty segments such as "A |  | B".
df['Reasons'] = _merge_labels(df, ['Reason_Equity', 'Reason_Mutual', 'Reason_Bonds', 'Reason_FD'])

# -------------------------------
# Step 2: Basic Visualizations
# -------------------------------
# Each chart is drawn on its own figure, written to a PNG and then closed.

# Gender distribution: one bar per gender value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="gender", palette="Set2", ax=ax)
ax.set_title("Gender Distribution")
fig.savefig("gender_distribution.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# Age distribution: 10-bin histogram with a KDE curve overlaid
ages = df['age']
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(ages, bins=10, kde=True, ax=ax)
ax.set_title("Age Distribution")
fig.savefig("age_distribution.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# Primary investment avenue: horizontal bars, most frequent avenue first
avenue_order = df['Avenue'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, y="Avenue", order=avenue_order, palette="viridis", ax=ax)
ax.set_title("Primary Investment Avenues")
fig.savefig("primary_investment_avenues.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# -------------------------------
# Step 3: Objectives & Factors
# -------------------------------

# Goal frequencies: every '|'-separated label in Goal becomes its own entry,
# with surrounding whitespace removed before counting.
goal_lists = df['Goal'].dropna().str.split('|')
all_goals = goal_lists.explode().str.strip()
goal_counts = all_goals.value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=goal_counts.values, y=goal_counts.index, palette="coolwarm", ax=ax)
ax.set_title("Investment Goals")
ax.set_xlabel("Frequency")
ax.set_ylabel("Goal")
fig.savefig("investment_goals.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# Factor frequencies: same treatment, but Factor is split on commas.
factor_lists = df['Factor'].dropna().str.split(',')
all_factors = factor_lists.explode().str.strip()
factor_counts = all_factors.value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=factor_counts.values, y=factor_counts.index, palette="magma", ax=ax)
ax.set_title("Factors Considered When Investing")
ax.set_xlabel("Frequency")
ax.set_ylabel("Factor")
fig.savefig("factors_investing.png", dpi=300, bbox_inches="tight")
plt.close(fig)

# -------------------------------
# Step 4: Cross-Demographic Insights
# -------------------------------

# Goals by Gender
# Keep rows that have both a gender and a Goal, then give every
# '|'-separated goal its own row so each label is counted separately.
goals_by_gender = df.dropna(subset=['gender','Goal']).copy()
goals_by_gender = goals_by_gender.assign(Goal=goals_by_gender['Goal'].str.split('|')).explode('Goal')
goals_by_gender['Goal'] = goals_by_gender['Goal'].str.strip()

plt.figure(figsize=(10,6))
# Bars are ordered by overall goal frequency and split by gender.
sns.countplot(data=goals_by_gender, x="Goal", hue="gender", order=goals_by_gender['Goal'].value_counts().index, palette="Set1")
plt.xticks(rotation=45, ha='right')
plt.title("Investment Goals by Gender")
plt.savefig("investment_goals_by_gender.png", dpi=300, bbox_inches="tight")
plt.close()

# Factors by Age Group
# pd.cut builds right-closed bins, so without include_lowest=True the first
# bin would be (18, 25] and respondents aged exactly 18 would get no group
# and be dropped by the dropna below, despite the '18-25' label.
# Ages below 18 or above 100 still fall outside every bin and become NaN.
df['Age_Group'] = pd.cut(
    df['age'],
    bins=[18,25,35,45,60,100],
    labels=['18-25','26-35','36-45','46-60','60+'],
    include_lowest=True,
)
# Keep rows with both an age group and a Factor, one row per
# comma-separated factor.
factors_by_age = df.dropna(subset=['Age_Group','Factor']).copy()
factors_by_age = factors_by_age.assign(Factor=factors_by_age['Factor'].str.split(',')).explode('Factor')
factors_by_age['Factor'] = factors_by_age['Factor'].str.strip()

plt.figure(figsize=(12,6))
# Bars are ordered by overall factor frequency and split by age group.
sns.countplot(data=factors_by_age, x="Factor", hue="Age_Group", order=factors_by_age['Factor'].value_counts().index, palette="tab20")
plt.xticks(rotation=45, ha='right')
plt.title("Factors Considered by Age Group")
plt.savefig("factors_by_age_group.png", dpi=300, bbox_inches="tight")
plt.close()

# -------------------------------
# Step 5: Ranking Preferences
# -------------------------------

# Mean rank of each investment option, sorted ascending so the option with
# the lowest (most preferred) average rank is plotted first.
ranking_cols = [
    "Mutual_Funds",
    "Equity_Market",
    "Debentures",
    "Government_Bonds",
    "Fixed_Deposits",
    "PPF",
    "Gold",
]
mean_rank_by_option = df[ranking_cols].mean()
avg_ranks = mean_rank_by_option.sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=avg_ranks.values, y=avg_ranks.index, palette="crest", ax=ax)
ax.set_title("Average Ranking of Investment Options (Lower = Preferred)")
ax.set_xlabel("Average Rank")
ax.set_ylabel("Investment Option")
fig.savefig("investment_option_rankings.png", dpi=300, bbox_inches="tight")
plt.close(fig)
