In [120]:
import pandas as pd
import numpy as np

profiles = pd.read_csv('profiles.csv')

## Investigate data

In [121]:
print(profiles.columns)
print(profiles.info())
print(profiles.describe())
print(profiles.head())


Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   body_type    54650 non-null  object 
 2   diet         35551 non-null  object 
 3   drinks       56961 non-null  object 
 4   drugs        45866 non-null  object 
 5   education    53318 non-null  object 
 6   essay0       54458 non-null  object 
 7   essay1       52374 non-null  object 
 8   essay2       50308 non-null  object 
 9   essay3       48470 non-null  object 
 10  essa

## Prepare data

### Clean starsign field

In [122]:
profiles['signs_one_word'] = profiles['sign'].str.split().str.get(0)

### 

### Convert ordered categorical fields into numbers (low number = lower score)

In [123]:
body_type_map = {"rather not say": 0, "skinnny": 1, "thin":2, "athletic":3, "fit":4, "average": 5, "jacked": 6, "full figured": 7, "curvy":8, "a little extra": 9, "overweight": 10, "used up": 11}
profiles["body_type_code"] = profiles.body_type.map(body_type_map)

drinks_map = {"rather not say": 0, "not at all": 1, "rarely": 2, "socially": 3, "often": 4, "very often": 5, "desperately": 6}
profiles["drinks_code"] = profiles.drinks.map(drinks_map)

smokes_map = {"rather not say": 0, "no": 1, "sometimes": 2, "when drinking": 3, "trying to quit": 4, "yes": 5}
profiles["smokes_code"] = profiles.smokes.map(smokes_map)

drugs_map = {"rather not say": 0, "never": 1, "sometimes": 0, "often": 0}
profiles["drugs_code"] = profiles.drugs.map(drugs_map)

sex_map = {"rather not say": 0, "m": 1, "f": 2}
profiles["sex_code"] = profiles.sex.map(sex_map)

status_map = {"rather not say": 0, "single": 1, "seeing someone": 2, "available": 1, "married": 2, "unkown": 0}
profiles["status_code"] = profiles.status.map(status_map)



### Replace null values

In [124]:

profiles.fillna({'body_type':0,
           'diet':0,
           'drinks':0,
           'drugs':0,
           'education':0,
           'ethnicity':0,
           'height':0,
           'job':0,
           'offspring':0,
           'pets':0,
           'religion':0,
           'sign':0,
           'smokes':0,
           'smokes_code':0,
           'drugs_code':0,
           'body_type_code':0,
           'drinks_code':0,
           'status_code':0,
           'speaks':""}, inplace = True)

# Check if there are still any null values
profiles.isna().any()

age               False
body_type         False
diet              False
drinks            False
drugs             False
education         False
essay0             True
essay1             True
essay2             True
essay3             True
essay4             True
essay5             True
essay6             True
essay7             True
essay8             True
essay9             True
ethnicity         False
height            False
income            False
job               False
last_online       False
location          False
offspring         False
orientation       False
pets              False
religion          False
sex               False
sign              False
smokes            False
speaks            False
status            False
signs_one_word     True
body_type_code    False
drinks_code       False
smokes_code       False
drugs_code        False
sex_code          False
status_code       False
dtype: bool

## Predict relationship between all variables and drug use

Smoking and drinking influence drug use more than other variables.

Sex, age and status have no impact

In [125]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


variables = ['smokes_code', 'drinks_code', 'body_type_code', 'status_code', 'height', 'sex_code', 'height', 'age']
for var in variables:
    X = profiles[[var]]
    y = profiles['drugs_code']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    print(f"Score for {var}:", model.score(X_test, y_test))








Score for smokes_code: 0.6346955796497081
Score for drinks_code: 0.6271893244370309
Score for body_type_code: 0.6226021684737281
Score for status_code: 0.6226021684737281


Score for height: 0.6230191826522101
Score for sex_code: 0.6226021684737281
Score for height: 0.6230191826522101
Score for age: 0.6226021684737281
