# Load the male users (including phony names, passwords, etc.) into the AWS db Users table

## Import dependencies and load survey data from S3 bucket

In [2]:
import pandas as pd

In [2]:
# Preprocessed speed-dating survey, read straight from the project's S3 bucket.
SURVEY_CSV_URL = 'https://aws-matchmaker-bucket.s3-us-west-2.amazonaws.com/SpeedDatingPreprocessed.csv'
df = pd.read_csv(SURVEY_CSV_URL)

In [3]:
# Preview the first five survey rows.
df.head(n=5)

Unnamed: 0,couple_id,f_age,f_imprace,f_attr,f_sinc,f_intel,f_fun,f_amb,f_race,f_intrace,...,m_imprace,m_attr,m_sinc,m_intel,m_fun,m_amb,m_race,samerace,match,m_intrace
0,f1-m11,21.0,2.0,6.0,9.0,7.0,7.0,6.0,4.0,0.0,...,7.0,6.0,8.0,8.0,8.0,8.0,2.0,0,0,0.0
1,f1-m12,21.0,2.0,7.0,8.0,7.0,8.0,5.0,4.0,0.0,...,1.0,7.0,8.0,10.0,7.0,7.0,2.0,0,0,0.0
2,f1-m13,21.0,2.0,5.0,8.0,9.0,8.0,5.0,4.0,2.0,...,3.0,10.0,10.0,10.0,10.0,10.0,4.0,1,1,3.0
3,f1-m14,21.0,2.0,7.0,6.0,8.0,7.0,6.0,4.0,0.0,...,1.0,7.0,8.0,9.0,8.0,9.0,2.0,0,1,0.0
4,f1-m15,21.0,2.0,5.0,6.0,7.0,7.0,6.0,4.0,0.0,...,3.0,8.0,7.0,9.0,6.0,9.0,3.0,0,1,0.0


## Transform data

Extract the male participant's iid from couple_id (e.g. `f1-m11` becomes `11`).

In [4]:
# couple_id has the form "f<iid>-m<iid>": keep the part after the dash and
# strip the "m" marker, leaving the male iid as a string.
df['m_iid'] = (
    df['couple_id']
    .str.split('-')
    .map(lambda pieces: pieces[1])
    .str.replace('m', '')
)

Use the mean of each man's partner ratings (the `f_*` columns, averaged per `m_iid`) as his attributes.

In [5]:
# Keep the male identity columns plus the five f_* rating columns.
df2 = df.loc[:, ['m_iid', 'm_age', 'm_imprace',
                 'f_attr', 'f_sinc', 'f_intel', 'f_fun', 'f_amb',
                 'm_race']]

# One row per (m_iid, m_age, m_imprace, m_race) combination, holding the mean
# of every rating column; gender is set to the constant 1 for all rows.
# NOTE(review): groupby skips rows whose key columns are NaN by default --
# confirm that no male participant is lost that way.
df3 = (
    df2.groupby(['m_iid', 'm_age', 'm_imprace', 'm_race'])
    .mean()
    .reset_index()
    .assign(gender=1)
)
df3.head()

Unnamed: 0,m_iid,m_age,m_imprace,m_race,f_attr,f_sinc,f_intel,f_fun,f_amb,gender
0,100,30.0,8.0,6.0,5.0,7.866667,8.066667,6.066667,7.6,1
1,101,22.0,3.0,4.0,4.666667,9.0,8.666667,6.0,6.333333,1
2,102,32.0,2.0,2.0,4.375,7.125,7.1875,5.0,6.25,1
3,103,26.0,8.0,2.0,6.3125,7.0625,6.9375,6.4375,7.0,1
4,104,27.0,6.0,2.0,7.25,7.1875,7.625,6.9375,7.0625,1


In [6]:
# Every target name is the source name without its two-character "m_"/"f_"
# prefix, so the mapping is derived instead of spelled out pair by pair.
df3.rename(
    columns={
        prefixed: prefixed[2:]
        for prefixed in ('m_iid', 'm_age', 'm_imprace', 'm_race',
                         'f_attr', 'f_sinc', 'f_intel', 'f_fun', 'f_amb')
    },
    inplace=True,
)

Import the phony user data and merge it with the survey data.

In [7]:
# Phony identities (names, passwords, ...) to attach to the survey users.
# Rows with any missing field are discarded, then the index is rebuilt as
# 0..n-1. The following cell pairs these rows with the survey rows by index,
# so the gaps dropna() leaves behind would silently drop the survey users
# sitting at those positions.
df4 = pd.read_csv('male_userdata.csv').dropna().reset_index(drop=True)

In [8]:
# Pair row i of the survey users with row i of the phony identities (inner
# join on the index). Column names present in both frames receive the _x/_y
# suffixes, matching pd.merge's defaults.
df5 = df3.join(df4, how='inner', lsuffix='_x', rsuffix='_y')
df5.head()

Unnamed: 0,iid,age_x,imprace,race,attr,sinc,intel,fun,amb,gender_x,firstname,lastname,password,screenname,gender_y,age_y,email,photo,region
0,100,30.0,8.0,6.0,5.0,7.866667,8.066667,6.066667,7.6,1,Luka,Nišić,Nišić92@*,Luka898,male,27,lukanišić@example.com,https://uinames.com/api/photos/male/8.jpg,Bosnia and Herzegovina
1,101,22.0,3.0,4.0,4.666667,9.0,8.666667,6.0,6.333333,1,Ráduly,Lázár,Lázár91{,Ráduly252,male,28,ráduly91@example.com,https://uinames.com/api/photos/male/2.jpg,Hungary
2,102,32.0,2.0,2.0,4.375,7.125,7.1875,5.0,6.25,1,Ηγησίας,Γιάγκος,Γιάγκος87$#,Ηγησίας548,male,32,Ηγησίας-87@example.com,https://uinames.com/api/photos/male/12.jpg,Greece
3,103,26.0,8.0,2.0,6.3125,7.0625,6.9375,6.4375,7.0,1,Λεωνίδας,Ταρσούλη,Ταρσούλη97_%,Λεωνίδας211,male,22,Λεωνίδας97@example.com,https://uinames.com/api/photos/male/20.jpg,Greece
4,104,27.0,6.0,2.0,7.25,7.1875,7.625,6.9375,7.0625,1,Joshua,Stone,Stone90^=,Joshua511,male,29,joshuastone@example.com,https://uinames.com/api/photos/male/19.jpg,United States


In [9]:
# Discard the phony-data columns that duplicate survey fields or are unused.
df5.drop(['gender_y', 'age_y', 'region'], axis='columns', inplace=True)

In [10]:
# Remove the "_x" merge suffix from the surviving survey columns.
df5.rename(
    columns={f'{base}_x': base for base in ('age', 'gender')},
    inplace=True,
)

In [11]:
# Preview the first five rows of the frame that will be loaded.
df5.head(n=5)

Unnamed: 0,iid,age,imprace,race,attr,sinc,intel,fun,amb,gender,firstname,lastname,password,screenname,email,photo
0,100,30.0,8.0,6.0,5.0,7.866667,8.066667,6.066667,7.6,1,Luka,Nišić,Nišić92@*,Luka898,lukanišić@example.com,https://uinames.com/api/photos/male/8.jpg
1,101,22.0,3.0,4.0,4.666667,9.0,8.666667,6.0,6.333333,1,Ráduly,Lázár,Lázár91{,Ráduly252,ráduly91@example.com,https://uinames.com/api/photos/male/2.jpg
2,102,32.0,2.0,2.0,4.375,7.125,7.1875,5.0,6.25,1,Ηγησίας,Γιάγκος,Γιάγκος87$#,Ηγησίας548,Ηγησίας-87@example.com,https://uinames.com/api/photos/male/12.jpg
3,103,26.0,8.0,2.0,6.3125,7.0625,6.9375,6.4375,7.0,1,Λεωνίδας,Ταρσούλη,Ταρσούλη97_%,Λεωνίδας211,Λεωνίδας97@example.com,https://uinames.com/api/photos/male/20.jpg
4,104,27.0,6.0,2.0,7.25,7.1875,7.625,6.9375,7.0625,1,Joshua,Stone,Stone90^=,Joshua511,joshuastone@example.com,https://uinames.com/api/photos/male/19.jpg


Load to the AWS RDS db Users table.

In [12]:
from urllib.parse import quote

from sqlalchemy import create_engine
import config

# Percent-encode the credentials before embedding them in the URL: a user name
# or password containing reserved characters such as '@', ':', '/' or '%'
# would otherwise be parsed as part of the host/port section.
# NOTE(review): assumes config stores the raw (not already URL-encoded) values.
pg_user = quote(str(config.pguser), safe='')
pg_password = quote(str(config.pgpassword), safe='')
engine = create_engine(
    f"postgresql+psycopg2://{pg_user}:{pg_password}@{config.pghost}:{config.pgport}/{config.pgdatabase}"
)

In [14]:
# Append the merged users to the "Users" table; the DataFrame index is not
# written as a column.
df5.to_sql(name='Users', con=engine, if_exists='append', index=False)