In [4]:
import pandas as pd
import numpy as np

# We already have our cleaned & filtered Pandas DataFrames (employment_sa_clean.csv and poverty_sa_clean.csv) from our pipeline, now we convert them into NumPy arrays for numerical operations:
employment = pd.read_csv(r"C:\Users\Admin\Desktop\GroupX_DataAnalysis\NDTA631-DataAnalysis-GroupX\data\processed\employment_sa_clean.csv")
poverty = pd.read_csv(r"C:\Users\Admin\Desktop\GroupX_DataAnalysis\NDTA631-DataAnalysis-GroupX\data\processed\poverty_sa_clean.csv")

# Convert selected numeric columns to NumPy arrays
employment_array = employment.select_dtypes(include=[np.number]).to_numpy()
poverty_array = poverty.select_dtypes(include=[np.number]).to_numpy()

print("Employment shape:", employment_array.shape)
print("Poverty shape:", poverty_array.shape)

#Now we reshape the arrays
# Example: reshape to 2D (rows, cols)
employment_reshaped = employment_array.reshape(employment_array.shape[0], -1)
poverty_reshaped = poverty_array.reshape(poverty_array.shape[0], -1)

print("Reshaped employment:", employment_reshaped.shape)
print("Reshaped poverty:", poverty_reshaped.shape)



Employment shape: (1, 66)
Poverty shape: (4, 5)
Reshaped employment: (1, 66)
Reshaped poverty: (4, 5)


In [7]:
#This cell will show how we perform useful operations for analysis
# Mean, median, standard deviation
employment_mean = np.mean(employment_array, axis=0)
poverty_mean = np.mean(poverty_array, axis=0)

employment_std = np.std(employment_array, axis=0)
poverty_std = np.std(poverty_array, axis=0)

print("\n--- Employment Stats ---")
print("Mean values:", employment_mean)
print("Standard deviation:", employment_std)

print("\n--- Poverty Stats ---")
print("Mean values:", poverty_mean)
print("Standard deviation:", poverty_std)

# Element-wise operations (if same shape)
if employment_array.shape == poverty_array.shape:
    diff = employment_array - poverty_array
    correlation = np.corrcoef(employment_array.flatten(), poverty_array.flatten())[0,1]
    print("Correlation between employment & poverty:", correlation)
 


--- Employment Stats ---
Mean values: [ 2.          0.         60.079      59.64033333 59.482      59.01
 58.92033333 59.10333333 59.583      59.90166667 60.049      60.255
 58.377      52.13275    52.28475    56.5716     52.77075    48.41745455
 52.77846036 50.0437534  51.29031831 51.44413834 51.41857857 51.13788407
 52.09399587 53.29276358 51.89257801 52.52918786 52.21640828 54.45103007
 54.60475466 56.26305351 54.81168621 55.8611311  55.48923607 55.39511324
 38.2        37.8        34.8        27.7        34.         36.8
 35.587      44.08       42.834      41.511      41.593      43.376
 44.896      44.43       43.286      41.264      39.365      39.317
 39.524      40.041      40.044      40.862      40.195      40.393
 40.319      39.534      35.779      34.332      35.701      37.359     ]
Standard deviation: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.

In [8]:
# Findings
print("\nInterpretation:")
print("1. The mean values show the central tendency for employment and poverty indicators.")
print("2. Standard deviation highlights variability — higher std means more fluctuations.")
print("3. If correlation is negative, it suggests that as employment increases, poverty decreases.")
print("4. Reshaping ensures arrays are in a form suitable for mathematical operations/comparisons.")


Interpretation:
1. The mean values show the central tendency for employment and poverty indicators.
2. Standard deviation highlights variability — higher std means more fluctuations.
3. If correlation is negative, it suggests that as employment increases, poverty decreases.
4. Reshaping ensures arrays are in a form suitable for mathematical operations/comparisons.
