# Assignment 2


In [11]:
import numpy as np
import pandas as pd



In [15]:
import os
os.getcwd()
df = pd.read_csv("economic_data.csv")
df.head()

Unnamed: 0,country,year,gdp,unemployment,inflation
0,Canada,2024,2.1,5.4,3.9
1,USA,2024,2.6,3.8,3.2
2,UK,2024,1.4,4.2,4.7
3,Germany,2024,0.8,3.1,2.8
4,Japan,2024,1.0,2.6,2.3


## Part 1: NumPy Foundations


## Question 1

In [16]:
#Input and Inspect the Data

country = df["country"].to_numpy()
year = df["year"].to_numpy()
gdp = df["gdp"].to_numpy()
unemployment = df["unemployment"].to_numpy()
inflation = df["inflation"].to_numpy()

def inspect(arr, name):
    print("\n", name)
    print("shape:", arr.shape)
    print("ndim:", arr.ndim)
    print("size:", arr.size)
    print("dtype:", arr.dtype)
    print("first:", arr[0])
    print("last:", arr[-1])

inspect(country, "country")
inspect(year, "year")
inspect(gdp, "gdp")
inspect(unemployment, "unemployment")
inspect(inflation, "inflation")


 country
shape: (5,)
ndim: 1
size: 5
dtype: object
first: Canada
last: Japan

 year
shape: (5,)
ndim: 1
size: 5
dtype: int64
first: 2024
last: 2024

 gdp
shape: (5,)
ndim: 1
size: 5
dtype: float64
first: 2.1
last: 1.0

 unemployment
shape: (5,)
ndim: 1
size: 5
dtype: float64
first: 5.4
last: 2.6

 inflation
shape: (5,)
ndim: 1
size: 5
dtype: float64
first: 3.9
last: 2.3


## Question 2

In [17]:
#Type Control

year = year.astype(float)
gdp = gdp.astype(float)
unemployment = unemployment.astype(float)
inflation = inflation.astype(float)

print(year.dtype)
print(gdp.dtype)
print(unemployment.dtype)
print(inflation.dtype)

joined = np.concatenate([
    country.astype(str),
    year.astype(str),
    gdp.astype(str),
    unemployment.astype(str),
    inflation.astype(str)
])

print(joined.dtype)
print(joined)


float64
float64
float64
float64
<U32
['Canada' 'USA' 'UK' 'Germany' 'Japan' '2024.0' '2024.0' '2024.0' '2024.0'
 '2024.0' '2.1' '2.6' '1.4' '0.8' '1.0' '5.4' '3.8' '4.2' '3.1' '2.6'
 '3.9' '3.2' '4.7' '2.8' '2.3']


Type control was applied by explicitly converting numeric arrays to float to ensure consistency. During concatenation, NumPy coerced all elements to string because arrays must share a common data type.

## Question 3

In [18]:
#Numeric Matrices & Slicing

numeric2d_array = np.column_stack([gdp, unemployment, inflation])

print(numeric2d_array)
print("Shape:", numeric2d_array.shape)

first_two_rows = numeric2d_array[0:1, :]
last_two_rows = numeric2d_array[:, 1:3]
inflation_every_second = numeric2d_array[0:1, 2:5:2]

print(first_two_rows)
print(last_two_rows)
print(inflation_every_second)

[[2.1 5.4 3.9]
 [2.6 3.8 3.2]
 [1.4 4.2 4.7]
 [0.8 3.1 2.8]
 [1.  2.6 2.3]]
Shape: (5, 3)
[[2.1 5.4 3.9]]
[[5.4 3.9]
 [3.8 3.2]
 [4.2 4.7]
 [3.1 2.8]
 [2.6 2.3]]
[[3.9]]


## Question 4

In [19]:
#Boolean Masks and Conditional Replacement

title = "Economic Indicators"
print(title)

gdp_mask = gdp > 1.5
inflation_mask = inflation >= 4

print(gdp_mask)
print(inflation_mask)

combined_mask = gdp_mask + inflation_mask
print(combined_mask)

filtered = numeric2d_array[combined_mask]
print(filtered)

inflation_label = np.where(inflation >= 4.0, "High", "Low")
print(inflation_label)

Economic Indicators
[ True  True False False False]
[False False  True False False]
[ True  True  True False False]
[[2.1 5.4 3.9]
 [2.6 3.8 3.2]
 [1.4 4.2 4.7]]
['Low' 'Low' 'High' 'Low' 'Low']


## Question 5

In [20]:
#Broadcasting Adjustments
numeric_array = np.column_stack([gdp, unemployment, inflation])

#Before
print("Before GDP:", numeric_array[:,0])
print("Before Inflation:", numeric_array[:, 2])

#broadcasting
shock = np.array([0.2, 0.0, 0.3])
numeric_shock_array = numeric_array + shock

#After
print("After GDP:", numeric_shock_array[:,0])
print("After Inflation:", numeric_shock_array[:, 0])

print(numeric_shock_array)

Before GDP: [2.1 2.6 1.4 0.8 1. ]
Before Inflation: [3.9 3.2 4.7 2.8 2.3]
After GDP: [2.3 2.8 1.6 1.  1.2]
After Inflation: [2.3 2.8 1.6 1.  1.2]
[[2.3 5.4 4.2]
 [2.8 3.8 3.5]
 [1.6 4.2 5. ]
 [1.  3.1 3.1]
 [1.2 2.6 2.6]]


Broadcasting works here because the shock vector has length 3 (one value per column), so NumPy automatically adds it to every row of the 2D array.

## Question 6

In [21]:
#Aggregation Along Axes

column_min = numeric2d_array.min(axis=0)
column_max = numeric2d_array.max(axis=0)
column_mean = numeric2d_array.mean(axis=0)
column_std = numeric2d_array.std(axis=0)

print("Column min:", column_min)
print("Column max:", column_max)
print("Column mean:", column_mean)
print("Column std:", column_std)

row_sums = numeric2d_array.sum(axis=1)
row_cumsum = numeric2d_array.cumsum(axis=1)

print("Row sums:", row_sums)
print("Row cumsum:", row_cumsum)

Column min: [0.8 2.6 2.3]
Column max: [2.6 5.4 4.7]
Column mean: [1.58 3.82 3.38]
Column std: [0.67646138 0.96415766 0.84237759]
Row sums: [11.4  9.6 10.3  6.7  5.9]
Row cumsum: [[ 2.1  7.5 11.4]
 [ 2.6  6.4  9.6]
 [ 1.4  5.6 10.3]
 [ 0.8  3.9  6.7]
 [ 1.   3.6  5.9]]


Axis=0 computes statistics down the rows for each column, while axis=1 computes statistics across columns for each row.

## Question 7

In [22]:
#Set Operations

countries1 = np.array(["Canada", "USA", "UK"])
countries2 = np.array(["UK", "Germany", "Japan"])

print(countries1)
print(countries2)

intersection = np.intersect1d(countries1, countries2)
union = np.union1d(countries1, countries2)
difference1 = np.setdiff1d(countries1, countries2)
difference2 = np.setdiff1d(countries2, countries1)
print("Intersection:", intersection)
print("Union:", union)
print("Difference1:", difference1)
print("Difference2:", difference2)

['Canada' 'USA' 'UK']
['UK' 'Germany' 'Japan']
Intersection: ['UK']
Union: ['Canada' 'Germany' 'Japan' 'UK' 'USA']
Difference1: ['Canada' 'USA']
Difference2: ['Germany' 'Japan']


## Question 8

In [23]:
#RNG Sampling Shuffling

rng= np.random.default_rng(seed=1)

gdp_sample_norep = rng.choice(gdp, size=3, replace=False)
print(gdp_sample_norep)
gdp_sample_rep = rng.choice(gdp, size=9, replace=True).reshape(3,3)
print(gdp_sample_rep)

gdp_shuffled_rows = gdp_sample_rep.copy()  
rng.shuffle(gdp_shuffled_rows, axis=1)
print(gdp_shuffled_rows)

[1.4 2.6 0.8]
[[2.1 1.  1. ]
 [2.6 2.6 1. ]
 [1.4 2.6 1. ]]
[[1.  2.1 1. ]
 [2.6 2.6 1. ]
 [2.6 1.4 1. ]]


## Question 9

In [24]:
#Save array

np.save('numeric_shock.npy', numeric_shock_array)

In [25]:
#load array
loaded_shock = np.load("numeric_shock.npy")
print(loaded_shock)

assert np.array_equal(numeric_shock_array, loaded_shock)
print("Arrays are equal")



[[2.3 5.4 4.2]
 [2.8 3.8 3.5]
 [1.6 4.2 5. ]
 [1.  3.1 3.1]
 [1.2 2.6 2.6]]
Arrays are equal


In [26]:
all_data_array = np.column_stack([
    country,
    year,
    gdp,
    unemployment,
    inflation
])

np.savetxt("economic_data_numpy.csv", all_data_array, delimiter=',', fmt='%s')

## Part 2: Pandas Tasks

## Question 1

In [50]:
#Load and Inspect

df = pd.read_csv("economic_data.csv", delimiter=',', index_col=False)
print(df.head())

   country  year  gdp  unemployment  inflation
0   Canada  2024  2.1           5.4        3.9
1      USA  2024  2.6           3.8        3.2
2       UK  2024  1.4           4.2        4.7
3  Germany  2024  0.8           3.1        2.8
4    Japan  2024  1.0           2.6        2.3


In [28]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       5 non-null      object 
 1   year          5 non-null      int64  
 2   gdp           5 non-null      float64
 3   unemployment  5 non-null      float64
 4   inflation     5 non-null      float64
dtypes: float64(3), int64(1), object(1)
memory usage: 328.0+ bytes
None


In [29]:
print(df.dtypes)

country          object
year              int64
gdp             float64
unemployment    float64
inflation       float64
dtype: object


## Question 2

In [30]:
#Selection, Filter, Sort

df_sel = df[["country", "gdp", "unemployment"]]
print(df_sel)

df_filtered = df[df["unemployment"] <= 4.0]
print("\nFiltered (unemployment <= 4.0):")
print(df_filtered)

df_sorted = df_filtered.sort_values(by=["gdp", "inflation"], ascending=[False, True])
print("\nSorted (gdp desc, inflation asc):")
print(df_sorted)

   country  gdp  unemployment
0   Canada  2.1           5.4
1      USA  2.6           3.8
2       UK  1.4           4.2
3  Germany  0.8           3.1
4    Japan  1.0           2.6

Filtered (unemployment <= 4.0):
   country  year  gdp  unemployment  inflation
1      USA  2024  2.6           3.8        3.2
3  Germany  2024  0.8           3.1        2.8
4    Japan  2024  1.0           2.6        2.3

Sorted (gdp desc, inflation asc):
   country  year  gdp  unemployment  inflation
1      USA  2024  2.6           3.8        3.2
4    Japan  2024  1.0           2.6        2.3
3  Germany  2024  0.8           3.1        2.8


## Question 3

In [31]:
#Derived Columns

def categorize_gdp(gdp):
    if gdp < 1.5:
        return 'Low'
    elif 1.5 <= gdp <= 2.5:
        return 'Moderate'
    else:
        return 'High'
df['gdp_categorized'] = df['gdp'].apply(categorize_gdp)
print(df[['country', 'year', 'gdp', 'unemployment', 'inflation', 'gdp_categorized']])


   country  year  gdp  unemployment  inflation gdp_categorized
0   Canada  2024  2.1           5.4        3.9        Moderate
1      USA  2024  2.6           3.8        3.2            High
2       UK  2024  1.4           4.2        4.7             Low
3  Germany  2024  0.8           3.1        2.8             Low
4    Japan  2024  1.0           2.6        2.3             Low


In [32]:
df["real_gdp_proxy"] = (df["gdp"] / df["inflation"]).round(2)

print(df[['country', 'year', 'gdp', 'inflation', 'real_gdp_proxy']])


   country  year  gdp  inflation  real_gdp_proxy
0   Canada  2024  2.1        3.9            0.54
1      USA  2024  2.6        3.2            0.81
2       UK  2024  1.4        4.7            0.30
3  Germany  2024  0.8        2.8            0.29
4    Japan  2024  1.0        2.3            0.43


## Question 4

In [33]:
#Conditional Replacement

df["inflation_label"] = np.where(df["inflation"] >= 4.0, "Rapid", "Normal")

print(df[['country', 'inflation', 'inflation_label']])

   country  inflation inflation_label
0   Canada        3.9          Normal
1      USA        3.2          Normal
2       UK        4.7           Rapid
3  Germany        2.8          Normal
4    Japan        2.3          Normal


## Question 5

In [34]:
#Build 2023 Data and Group

df_2024 = df.copy()

df_2023 = df_2024.copy()
df_2023['year'] = df_2023['year'] - 1
df_2023['gdp'] = df_2023['gdp'] - 0.2
df_2023['unemployment'] = df_2023['unemployment'] - 0.1
df_2023['inflation'] = df_2023['inflation'] - 0.1
df_2023.head()



Unnamed: 0,country,year,gdp,unemployment,inflation,gdp_categorized,real_gdp_proxy,inflation_label
0,Canada,2023,1.9,5.3,3.8,Moderate,0.54,Normal
1,USA,2023,2.4,3.7,3.1,High,0.81,Normal
2,UK,2023,1.2,4.1,4.6,Low,0.3,Rapid
3,Germany,2023,0.6,3.0,2.7,Low,0.29,Normal
4,Japan,2023,0.8,2.5,2.2,Low,0.43,Normal


In [35]:
df_concat = pd.concat([df_2023, df_2024], ignore_index=True)

print("\nCombined Data (2023 + 2024):")
print(df_concat)




Combined Data (2023 + 2024):
   country  year  gdp  unemployment  inflation gdp_categorized  \
0   Canada  2023  1.9           5.3        3.8        Moderate   
1      USA  2023  2.4           3.7        3.1            High   
2       UK  2023  1.2           4.1        4.6             Low   
3  Germany  2023  0.6           3.0        2.7             Low   
4    Japan  2023  0.8           2.5        2.2             Low   
5   Canada  2024  2.1           5.4        3.9        Moderate   
6      USA  2024  2.6           3.8        3.2            High   
7       UK  2024  1.4           4.2        4.7             Low   
8  Germany  2024  0.8           3.1        2.8             Low   
9    Japan  2024  1.0           2.6        2.3             Low   

   real_gdp_proxy inflation_label  
0            0.54          Normal  
1            0.81          Normal  
2            0.30           Rapid  
3            0.29          Normal  
4            0.43          Normal  
5            0.54          

In [36]:
df_stats = df_concat.groupby('country', as_index=False).agg(
mean_gdp = ('gdp', 'mean'),
mean_unemployment = ('unemployment', 'mean'))
print(df_stats)

   country  mean_gdp  mean_unemployment
0   Canada       2.0               5.35
1  Germany       0.7               3.05
2    Japan       0.9               2.55
3       UK       1.3               4.15
4      USA       2.5               3.75


## Question 6

In [37]:
#Basic Merge

country_to_region = pd.DataFrame({
    'country': ['Canada', 'Germany', 'Japan', 'UK', 'USA'],
    'region': ['North America', 'Europe', 'Asia', 'Europe', 'North America']
})
 
df_merged = df_stats.merge(country_to_region, on="country", how="left")
print(df_merged)

   country  mean_gdp  mean_unemployment         region
0   Canada       2.0               5.35  North America
1  Germany       0.7               3.05         Europe
2    Japan       0.9               2.55           Asia
3       UK       1.3               4.15         Europe
4      USA       2.5               3.75  North America


## Question 7

In [38]:

from datetime import date
from datetime import timedelta

print(df_concat.columns)
print(df_concat[['country','year','gdp','gdp_categorized']].head(10))
print(df_concat['gdp_categorized'].value_counts(dropna=False))


Index(['country', 'year', 'gdp', 'unemployment', 'inflation',
       'gdp_categorized', 'real_gdp_proxy', 'inflation_label'],
      dtype='object')
   country  year  gdp gdp_categorized
0   Canada  2023  1.9        Moderate
1      USA  2023  2.4            High
2       UK  2023  1.2             Low
3  Germany  2023  0.6             Low
4    Japan  2023  0.8             Low
5   Canada  2024  2.1        Moderate
6      USA  2024  2.6            High
7       UK  2024  1.4             Low
8  Germany  2024  0.8             Low
9    Japan  2024  1.0             Low
gdp_categorized
Low         6
Moderate    2
High        2
Name: count, dtype: int64


In [39]:
#Datetime, Categorical and String

df_concat['date_jan1'] = pd.to_datetime(df_concat['year'].astype(str) + '-01-01')
print(df_concat.head())




   country  year  gdp  unemployment  inflation gdp_categorized  \
0   Canada  2023  1.9           5.3        3.8        Moderate   
1      USA  2023  2.4           3.7        3.1            High   
2       UK  2023  1.2           4.1        4.6             Low   
3  Germany  2023  0.6           3.0        2.7             Low   
4    Japan  2023  0.8           2.5        2.2             Low   

   real_gdp_proxy inflation_label  date_jan1  
0            0.54          Normal 2023-01-01  
1            0.81          Normal 2023-01-01  
2            0.30           Rapid 2023-01-01  
3            0.29          Normal 2023-01-01  
4            0.43          Normal 2023-01-01  


In [40]:
df_concat['date_dec31'] = df_concat['date_jan1'] + pd.offsets.YearEnd(0)
print(df_concat[['country', 'year', 'date_jan1', 'date_dec31']].head())

   country  year  date_jan1 date_dec31
0   Canada  2023 2023-01-01 2023-12-31
1      USA  2023 2023-01-01 2023-12-31
2       UK  2023 2023-01-01 2023-12-31
3  Germany  2023 2023-01-01 2023-12-31
4    Japan  2023 2023-01-01 2023-12-31


In [41]:
df_concat['gdp_categorized'] = df_concat['gdp_categorized'].astype('category')
print(df_concat.groupby('gdp_categorized').describe())


                 year                                                    \
                count    mean     min      25%     50%      75%     max   
gdp_categorized                                                           
High              2.0  2023.5  2023.0  2023.25  2023.5  2023.75  2024.0   
Low               6.0  2023.5  2023.0  2023.00  2023.5  2024.00  2024.0   
Moderate          2.0  2023.5  2023.0  2023.25  2023.5  2023.75  2024.0   

                            gdp            ...            date_jan1       \
                      std count      mean  ...                  max  std   
gdp_categorized                            ...                             
High             0.707107   2.0  2.500000  ...  2024-01-01 00:00:00  NaN   
Low              0.547723   6.0  0.966667  ...  2024-01-01 00:00:00  NaN   
Moderate         0.707107   2.0  2.000000  ...  2024-01-01 00:00:00  NaN   

                date_dec31                                            \
                    

In [42]:
df_concat.groupby('gdp_categorized')[['unemployment', 'inflation']].mean()
print(df_concat.groupby('gdp_categorized')[['unemployment', 'inflation']].mean())

                 unemployment  inflation
gdp_categorized                         
High                     3.75   3.150000
Low                      3.25   3.216667
Moderate                 5.35   3.850000


In [43]:
df_concat['concat'] = df_concat['country'].str.lower()
print(df_concat['country'].head())


0     Canada
1        USA
2         UK
3    Germany
4      Japan
Name: country, dtype: object


## Question 8

In [44]:
df_concat.to_csv('DataFrame.csv', index=False)
!ls

Assignment1.ipynb  DataFrame.csv      economic_data_numpy.csv
Assignment2.ipynb  economic_data.csv  numeric_shock.npy


In [45]:
df_concat.to_excel('DataFrame.xlsx', index=False)
!ls

Assignment1.ipynb  DataFrame.csv   economic_data.csv	    numeric_shock.npy
Assignment2.ipynb  DataFrame.xlsx  economic_data_numpy.csv


In [46]:
df_concat.to_stata('DataFrame.dta', write_index=False)
!ls

Assignment1.ipynb  DataFrame.csv  DataFrame.xlsx     economic_data_numpy.csv
Assignment2.ipynb  DataFrame.dta  economic_data.csv  numeric_shock.npy


In [47]:
df_concat.to_pickle('DataFrame.pkl')
!ls

Assignment1.ipynb  DataFrame.dta   economic_data.csv
Assignment2.ipynb  DataFrame.pkl   economic_data_numpy.csv
DataFrame.csv	   DataFrame.xlsx  numeric_shock.npy


In [48]:
df_concat.to_json('DataFrame.json', orient='table')
!ls

Assignment1.ipynb  DataFrame.dta   DataFrame.xlsx	    numeric_shock.npy
Assignment2.ipynb  DataFrame.json  economic_data.csv
DataFrame.csv	   DataFrame.pkl   economic_data_numpy.csv


In [49]:
df.to_hdf('DataFrame.h5', key='df', mode='w', format='table')
!ls


Assignment1.ipynb  DataFrame.dta   DataFrame.pkl      economic_data_numpy.csv
Assignment2.ipynb  DataFrame.h5    DataFrame.xlsx     numeric_shock.npy
DataFrame.csv	   DataFrame.json  economic_data.csv
