In [1]:
import pandas as pd
import csv

def custom_to_csv(data, name):
    """Write ``data`` to the CSV file ``name`` without a trailing newline.

    The object is first serialized with ``data.to_csv(name, index=False)``.
    The file is then read back and rewritten with the trailing whitespace
    (including the final line terminator) stripped from its last line.

    Args:
        data: Object exposing ``to_csv(path, index=False)``; in this notebook
            a pandas DataFrame or Series.
        name: Path of the CSV file to create or overwrite.
    """
    data.to_csv(name, index=False)

    # Context managers guarantee the handles are closed even if reading or
    # writing fails part-way through.
    with open(name, "r") as f:
        lines = f.readlines()

    # Nothing to trim when the serializer produced an empty file.
    if not lines:
        return

    lines[-1] = lines[-1].rstrip()
    with open(name, "w") as f:
        f.writelines(lines)

In [15]:
# Load the financial journal CSV into a DataFrame and display it
finance_journal_path = "../data/Datasets/Journals/FinancialJournal.csv"
finance_df = pd.read_csv(finance_journal_path)
finance_df

Unnamed: 0,participantId,timestamp,category,amount
0,0,2022-03-01T00:00:00Z,Wage,2472.507559
1,0,2022-03-01T00:00:00Z,Shelter,-554.988622
2,0,2022-03-01T00:00:00Z,Education,-38.005380
3,1,2022-03-01T00:00:00Z,Wage,2046.562206
4,1,2022-03-01T00:00:00Z,Shelter,-554.988622
...,...,...,...,...
1856325,39,2023-05-25T00:05:00Z,Recreation,-2.760368
1856326,28,2023-05-25T00:05:00Z,Recreation,-29.458409
1856327,370,2023-05-25T00:05:00Z,Recreation,-28.444239
1856328,537,2023-05-25T00:05:00Z,Food,-4.000000


In [16]:
# Rank participants by their number of journal entries and keep the 131 with
# the fewest: these are the participants known to have dropped out after
# about a month. Their ids are saved to DroppedOut.csv.
entries_per_participant = finance_df["participantId"].value_counts()
count_fin = entries_per_participant.reset_index().sort_values("count")
dropped_out = count_fin.iloc[:131]
custom_to_csv(data=dropped_out["participantId"], name="DroppedOut.csv")
dropped_out

Unnamed: 0,participantId,count
1010,653,11
1009,875,11
1008,346,12
1004,279,14
1007,846,14
...,...,...
882,514,26
884,285,26
885,621,26
881,762,27


In [17]:
# Drop the sign of every amount so that all values are non-negative magnitudes
finance_df["amount"] = abs(finance_df["amount"])
finance_df.head(5)

Unnamed: 0,participantId,timestamp,category,amount
0,0,2022-03-01T00:00:00Z,Wage,2472.507559
1,0,2022-03-01T00:00:00Z,Shelter,554.988622
2,0,2022-03-01T00:00:00Z,Education,38.00538
3,1,2022-03-01T00:00:00Z,Wage,2046.562206
4,1,2022-03-01T00:00:00Z,Shelter,554.988622


In [18]:
# Total 'amount' for every (participantId, category) pair, kept as a flat table
group_keys = ["participantId", "category"]
result_df = finance_df.groupby(group_keys, as_index=False)["amount"].sum()
result_df.head(5)

Unnamed: 0,participantId,category,amount
0,0,Education,608.086073
1,0,Food,3868.580758
2,0,Recreation,4870.355051
3,0,Shelter,8879.817947
4,0,Wage,134904.668457


In [21]:
# Persist the per-participant, per-category totals to disk
custom_to_csv(data=result_df, name="AggregatedFinancialJournal.csv")

<hr/>

In [22]:
# Load the participants attribute table and preview its first rows
participants_path = "../data/Datasets/Attributes/Participants.csv"
part_df = pd.read_csv(participants_path)
part_df.head(5)

Unnamed: 0,participantId,householdSize,haveKids,age,educationLevel,interestGroup,joviality
0,0,3,True,36,HighSchoolOrCollege,H,0.001627
1,1,3,True,25,HighSchoolOrCollege,B,0.328087
2,2,3,True,35,HighSchoolOrCollege,A,0.39347
3,3,3,True,21,HighSchoolOrCollege,I,0.138063
4,4,3,True,43,Bachelors,H,0.857397


In [23]:
# Per-participant sum over every category except "Wage" and "RentAdjustment"
excluded_categories = ["Wage", "RentAdjustment"]
is_counted = ~result_df["category"].isin(excluded_categories)
total_exp = result_df.loc[is_counted].groupby("participantId")["amount"].sum()
total_exp

participantId
0       18226.839829
1       21139.514857
2       18363.684618
3       19327.114335
4       33132.259641
            ...     
1006    19239.292195
1007    24756.701369
1008    14491.497800
1009    22273.255338
1010    19331.243416
Name: amount, Length: 1011, dtype: float64

In [24]:
# Per-participant amount recorded under the "Food" category
is_food = result_df["category"] == "Food"
food_exp = result_df.loc[is_food].groupby("participantId")["amount"].sum()
food_exp

participantId
0       3868.580758
1       3912.751140
2       4265.218707
3       4189.696815
4       4017.282757
           ...     
1006    6559.485799
1007    6644.145592
1008    4306.678867
1009    7924.229957
1010    4284.186971
Name: amount, Length: 1011, dtype: float64

In [25]:
# Ratio of the food amount to the total amount per participant, stored under
# the name "engels" (the two Series are aligned on participantId)
engel_coeff = food_exp.div(total_exp).rename("engels")
engel_coeff

participantId
0       0.212246
1       0.185092
2       0.232264
3       0.216778
4       0.121250
          ...   
1006    0.340942
1007    0.268378
1008    0.297187
1009    0.355773
1010    0.221620
Name: engels, Length: 1011, dtype: float64

In [26]:
# Attach the "engels" ratio to each participant and store haveKids as the
# upper-case strings "TRUE" / "FALSE"
have_kids_labels = {True: "TRUE", False: "FALSE"}
part_df_augmented = part_df.merge(engel_coeff, on="participantId").assign(
    haveKids=lambda frame: frame["haveKids"].map(have_kids_labels)
)
part_df_augmented

Unnamed: 0,participantId,householdSize,haveKids,age,educationLevel,interestGroup,joviality,engels
0,0,3,TRUE,36,HighSchoolOrCollege,H,0.001627,0.212246
1,1,3,TRUE,25,HighSchoolOrCollege,B,0.328087,0.185092
2,2,3,TRUE,35,HighSchoolOrCollege,A,0.393470,0.232264
3,3,3,TRUE,21,HighSchoolOrCollege,I,0.138063,0.216778
4,4,3,TRUE,43,Bachelors,H,0.857397,0.121250
...,...,...,...,...,...,...,...,...
1006,1006,1,FALSE,19,HighSchoolOrCollege,J,0.639268,0.340942
1007,1007,1,FALSE,40,HighSchoolOrCollege,B,0.934348,0.268378
1008,1008,1,FALSE,23,Graduate,C,0.163721,0.297187
1009,1009,1,FALSE,39,Low,B,0.828330,0.355773


In [None]:
# Save the participants table extended with the "engels" column
custom_to_csv(data=part_df_augmented, name="ParticipantsAugmented.csv")

<hr/>

In [64]:
# Reload the augmented participants table from disk
participants_augmented_path = "ParticipantsAugmented.csv"
participants = pd.read_csv(participants_augmented_path)
participants

Unnamed: 0,participantId,householdSize,haveKids,age,educationLevel,interestGroup,joviality,engels
0,0,3,True,36,HighSchoolOrCollege,H,0.001627,0.212246
1,1,3,True,25,HighSchoolOrCollege,B,0.328087,0.185092
2,2,3,True,35,HighSchoolOrCollege,A,0.393470,0.232264
3,3,3,True,21,HighSchoolOrCollege,I,0.138063,0.216778
4,4,3,True,43,Bachelors,H,0.857397,0.121250
...,...,...,...,...,...,...,...,...
1006,1006,1,False,19,HighSchoolOrCollege,J,0.639268,0.340942
1007,1007,1,False,40,HighSchoolOrCollege,B,0.934348,0.268378
1008,1008,1,False,23,Graduate,C,0.163721,0.297187
1009,1009,1,False,39,Low,B,0.828330,0.355773


In [65]:
# Load homes.csv, keeping only the participant id and its "apartments" column
home_columns = ["participantId", "apartments"]
homes = pd.read_csv("homes.csv")[home_columns]
homes

Unnamed: 0,participantId,apartments
0,0,926
1,1,928
2,2,291
3,3,1243
4,4,19421029375136
...,...,...
1006,1006,1633
1007,1007,1620
1008,1008,525
1009,1009,663


In [66]:
def split_apartments(x):
    """Return the last apartment id contained in ``x`` as an ``int``.

    Args:
        x: One cell of the "apartments" column. Either a missing value, a
            string of comma-separated ids (e.g. ``"1942,1029,375,136"``), or
            a plain number when the cell was parsed as numeric.

    Returns:
        ``pandas.NA`` for a missing value, otherwise the last id as ``int``.
    """
    # Test for missing values explicitly: a type check against ``float``
    # would also discard valid ids that were parsed as floats (e.g. 926.0).
    if pd.isna(x):
        return pd.NA  # NaN are now pandas.NA
    if isinstance(x, str):
        return int(x.split(",")[-1])
    # Numeric cell (column read as int/float): it already holds a single id.
    return int(x)

# Replace the raw "apartments" column with a single apartmentId per participant
homes = homes.assign(
    apartmentId=homes["apartments"].apply(split_apartments)
).drop(columns="apartments")
homes

Unnamed: 0,participantId,apartmentId
0,0,926
1,1,928
2,2,291
3,3,1243
4,4,136
...,...,...
1006,1006,1633
1007,1007,1620
1008,1008,525
1009,1009,663


In [67]:
# Load the apartments table (includes locationX / locationY per apartmentId)
apartments_path = "ApartmentsCoord.csv"
apartments = pd.read_csv(apartments_path)
apartments

Unnamed: 0,apartmentId,rentalCost,maxOccupancy,numberOfRooms,locationX,locationY,buildingId
0,1,768.16,2,4,1077.697944,648.442716,340
1,2,1014.55,2,1,-185.929284,1520.327098,752
2,3,1057.39,4,3,2123.014186,5126.753457,639
3,4,1259.10,4,3,2103.630178,4266.932930,397
4,5,411.50,1,4,7.058974,79.961637,628
...,...,...,...,...,...,...,...
1512,1729,703.80,2,4,-4097.077997,7409.520667,182
1513,1730,1104.62,4,2,-4334.244917,7164.441147,613
1514,1731,890.69,4,1,-3330.585557,7588.646019,513
1515,1732,703.80,2,3,-4151.807872,7409.520667,182


In [68]:
# Look up each participant's apartment and drop the apartment columns that
# are not needed afterwards
unused_columns = ["rentalCost", "maxOccupancy", "numberOfRooms", "buildingId"]
homesJoined = pd.merge(homes, apartments, how="left", on="apartmentId").drop(
    columns=unused_columns
)
homesJoined

Unnamed: 0,participantId,apartmentId,locationX,locationY
0,0,926.0,-2721.353174,6862.861219
1,1,928.0,-1531.132885,5597.24422
2,2,291.0,-1352.967752,2095.77944
3,3,1243.0,-1550.046162,5606.726709
4,4,136.0,1488.478248,3490.893224
5,5,243.0,-1511.444069,1994.482112
6,6,183.0,1789.415305,3245.310488
7,7,97.0,-1025.463021,1571.165821
8,8,321.0,611.33466,2266.011784
9,9,355.0,-2025.813803,2039.399099


In [74]:
# Add the apartment columns to every participant (left join keeps all participants)
participantsJoined = pd.merge(participants, homesJoined, how="left", on="participantId")
participantsJoined

Unnamed: 0,participantId,householdSize,haveKids,age,educationLevel,interestGroup,joviality,engels,apartmentId,locationX,locationY
0,0,3,True,36,HighSchoolOrCollege,H,0.001627,0.212246,926,-2721.353174,6862.861219
1,1,3,True,25,HighSchoolOrCollege,B,0.328087,0.185092,928,-1531.132885,5597.244220
2,2,3,True,35,HighSchoolOrCollege,A,0.393470,0.232264,291,-1352.967752,2095.779440
3,3,3,True,21,HighSchoolOrCollege,I,0.138063,0.216778,1243,-1550.046162,5606.726709
4,4,3,True,43,Bachelors,H,0.857397,0.121250,136,1488.478248,3490.893224
...,...,...,...,...,...,...,...,...,...,...,...
1006,1006,1,False,19,HighSchoolOrCollege,J,0.639268,0.340942,1633,-4227.336286,5302.398117
1007,1007,1,False,40,HighSchoolOrCollege,B,0.934348,0.268378,1620,-3969.050409,6563.139153
1008,1008,1,False,23,Graduate,C,0.163721,0.297187,525,-2158.723147,2525.141681
1009,1009,1,False,39,Low,B,0.828330,0.355773,663,-2366.812535,4880.767220


In [75]:
# Overwrite ParticipantsAugmented.csv (written and re-read earlier in this
# notebook) with the version that also carries the apartment columns
custom_to_csv(data=participantsJoined, name="ParticipantsAugmented.csv")