In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import glob

# -----------------------------
# 📥 Step 1: Load economic + population data
# -----------------------------
eco_df = pd.read_csv("new-eco.csv")
pop_df = pd.read_excel("onlypopulation-cleaned.xlsx")

eco_df.columns = eco_df.columns.map(str.strip)
pop_df.columns = pop_df.columns.map(str.strip)

# Rename merge column if needed
if 'SA2 Name' in pop_df.columns:
    pop_df.rename(columns={"SA2 Name": "SA2_Name"}, inplace=True)
if 'SA2 Name' in eco_df.columns:
    eco_df.rename(columns={"SA2 Name": "SA2_Name"}, inplace=True)

# Merge population and economic data
merged_df = pd.merge(eco_df, pop_df, on="SA2_Name", how="inner")

# -----------------------------
# 🚆 Step 2: Load Opal patronage .txt files
# -----------------------------
opal_files = glob.glob("Opal_Patronage_*.txt")
opal_dfs = []

for file in opal_files:
    df = pd.read_csv(file, delimiter="|")
    df.columns = df.columns.str.strip()
    # Adjust to your actual column names
    if {'ti_location_name', 'trip_count'}.issubset(df.columns):
        opal_dfs.append(df[['ti_location_name', 'trip_count']])

# Combine and aggregate all patronage data
opal_all = pd.concat(opal_dfs, ignore_index=True)
opal_grouped = opal_all.groupby("ti_location_name")["trip_count"].sum().reset_index()
opal_grouped.rename(columns={"ti_location_name": "SA2_Name", "trip_count": "Total_Trips"}, inplace=True)

# -----------------------------
# 🔁 Step 3: Merge all together
# -----------------------------
final_df = pd.merge(merged_df, opal_grouped, on="SA2_Name", how="left")
final_df["Total_Trips"].fillna(0, inplace=True)

# -----------------------------
# 📊 Step 4: Correlation and Visuals
# -----------------------------
sns.scatterplot(data=final_df, x="Pop_Growth_Percent", y="Total_Trips")
plt.title("Population Growth vs Opal Trips")
plt.xlabel("Population Growth (%)")
plt.ylabel("Total Opal Trips")
plt.grid(True)
plt.show()

correlation = final_df["Pop_Growth_Percent"].corr(final_df["Total_Trips"])
print(f"🔗 Correlation: {correlation:.3f}")

# -----------------------------
# 🤖 Step 5: Clustering SA2 areas
# -----------------------------
features = final_df[["Pop_Growth_Percent", "Total_Trips"]].fillna(0)
scaled = StandardScaler().fit_transform(features)

kmeans = KMeans(n_clusters=3, random_state=42)
final_df["Cluster"] = kmeans.fit_predict(scaled)

sns.scatterplot(data=final_df, x="Pop_Growth_Percent", y="Total_Trips", hue="Cluster", palette="viridis")
plt.title("Clustering SA2 Areas by Growth & Transport")
plt.show()


TypeError: descriptor 'strip' for 'str' objects doesn't apply to a 'int' object