In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# **Data Collection**

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
HT_data = pd.read_csv(r'/content/drive/MyDrive/Key folders/EAU SC 2023 Project/human_trafficking.csv')

  HT_data = pd.read_csv(r'/content/drive/MyDrive/Key folders/EAU SC 2023 Project/human_trafficking.csv')


# **Data Cleaning**

In [4]:
HT_data.columns

Index(['yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
       'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
       'citizenship', 'meansOfControlDebtBondage',
       'meansOfControlTakesEarnings', 'meansOfControlRestrictsFinancialAccess',
       'meansOfControlThreats', 'meansOfControlPsychologicalAbuse',
       'meansOfControlPhysicalAbuse', 'meansOfControlSexualAbuse',
       'meansOfControlFalsePromises', 'meansOfControlPsychoactiveSubstances',
       'meansOfControlRestrictsMovement', 'meansOfControlRestrictsMedicalCare',
       'meansOfControlExcessiveWorkingHours', 'meansOfControlUsesChildren',
       'meansOfControlThreatOfLawEnforcement',
       'meansOfControlWithholdsNecessities',
       'meansOfControlWithholdsDocuments', 'meansOfControlOther',
       'meansOfControlNotSpecified', 'meansOfControlConcatenated',
       'isForcedLabour', 'isSexualExploit', 'isOtherExploit', 'isSexAndLabour',
       'isForcedMarriage', 'isForcedMilitary', 'isOrganRemova

In [5]:
HT_data.dtypes

yearOfRegistration                   int64
Datasource                          object
gender                              object
ageBroad                            object
majorityStatus                      object
                                     ...  
recruiterRelationIntimatePartner     int64
recruiterRelationFriend              int64
recruiterRelationFamily              int64
recruiterRelationOther               int64
recruiterRelationUnknown             int64
Length: 63, dtype: object

In [6]:
HT_data.head(5)

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlDebtBondage,meansOfControlTakesEarnings,...,typeOfSexPrivateSexualServices,typeOfSexConcatenated,isAbduction,RecruiterRelationship,CountryOfExploitation,recruiterRelationIntimatePartner,recruiterRelationFriend,recruiterRelationFamily,recruiterRelationOther,recruiterRelationUnknown
0,2002,Case Management,Female,18--20,Adult,-99,-99,CO,-99,-99,...,-99,-99,-99,-99,-99,0,0,0,0,1
1,2002,Case Management,Female,18--20,Adult,-99,-99,CO,-99,-99,...,-99,-99,-99,-99,-99,0,0,0,0,1
2,2002,Case Management,Female,18--20,Adult,-99,-99,CO,-99,-99,...,-99,-99,-99,-99,-99,0,0,0,0,1
3,2002,Case Management,Female,18--20,Adult,-99,-99,CO,-99,-99,...,-99,-99,-99,-99,-99,0,0,0,0,1
4,2002,Case Management,Female,18--20,Adult,-99,-99,CO,-99,-99,...,-99,-99,-99,-99,-99,0,0,0,0,1


Most of the columns in the dataset are not useful for our task which is :

>**Taking an input of the person's age, country, gender, or any combination of the 3, and responding with how likely they are of being humanly trafficked, ie, what is the chance that a random person who has been humanly trafficked is from the same country.**

In [7]:
HT_y_is_missing = HT_data[
    (HT_data['typeOfExploitConcatenated'] == '-99') &
    (HT_data['isForcedLabour'] != -99) &
    (HT_data['isSexualExploit'] != -99) &
    (HT_data['isOtherExploit'] != -99) &
    (HT_data['isSexAndLabour'] != -99) &
    (HT_data['isForcedMarriage'] != -99) &
    (HT_data['isForcedMilitary'] != -99) &
    (HT_data['isOrganRemoval'] != -99) &
    (HT_data['isSlaveryAndPractices'] != -99)
    ]

HT_y_is_missing.shape

(17, 63)

In [8]:
HT_data = HT_data[['gender', 'ageBroad', 'citizenship', 'typeOfExploitConcatenated']]
HT_data.head(5)

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
0,Female,18--20,CO,Sexual exploitation
1,Female,18--20,CO,Sexual exploitation
2,Female,18--20,CO,Sexual exploitation
3,Female,18--20,CO,Sexual exploitation
4,Female,18--20,CO,Sexual exploitation


In [9]:
HT_data.shape

(48801, 4)

In [10]:
HT_data.typeOfExploitConcatenated.unique()

array(['Sexual exploitation', '-99', 'Forced labour', 'Other',
       'Forced labour;Other', 'Slavery and similar practices',
       'Forced marriage',
       'Forced labour;Sexual exploitation;Combined sexual and labour exploitation'],
      dtype=object)

In [11]:
HT_data.typeOfExploitConcatenated.value_counts()

-99                                                                          16174
Sexual exploitation                                                          15989
Forced labour                                                                 8969
Other                                                                         7063
Slavery and similar practices                                                  359
Forced marriage                                                                168
Forced labour;Sexual exploitation;Combined sexual and labour exploitation       78
Forced labour;Other                                                              1
Name: typeOfExploitConcatenated, dtype: int64

In [12]:
HT_data = HT_data[HT_data.typeOfExploitConcatenated != '-99']
HT_data.shape

(32627, 4)

In [13]:
HT_data.citizenship.unique()

array(['CO', 'MD', 'RO', 'UA', 'BY', 'GH', 'BG', 'ID', 'GW', 'KG', 'SN',
       'LA', 'KZ', 'HT', 'LK', 'MM', 'AF', 'UG', 'ER', 'NG', 'NP', 'PH',
       'KH', 'UZ', '00', 'US', 'TH', 'KR', '-99', 'VN', 'KE', 'CN', 'MX'],
      dtype=object)

In [14]:
HT_data = HT_data[HT_data.citizenship != '00']
HT_data = HT_data[HT_data.citizenship != '-99']
HT_data.shape

(24138, 4)

In [15]:
HT_data.ageBroad.unique()

array(['18--20', '21--23', '24--26', '27--29', '30--38', '9--17', '-99',
       '0--8', '39--47', '48+'], dtype=object)

In [16]:
HT_data = HT_data[HT_data.ageBroad != '-99']
HT_data.shape

(13431, 4)

In [17]:
HT_data.sample(5)

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
11338,Female,30--38,LK,Forced labour
18079,Male,39--47,BY,Forced labour
18103,Male,39--47,KG,Forced labour
29809,Male,39--47,TH,Forced labour
40584,Female,24--26,US,Sexual exploitation


In [18]:
HT_data.ageBroad.value_counts()

30--38    3152
9--17     2397
21--23    1822
18--20    1754
24--26    1413
39--47    1213
27--29    1060
48+        402
0--8       218
Name: ageBroad, dtype: int64

In [43]:
HT_data.dtypes

gender                       int64
ageBroad                     int64
citizenship                  int64
typeOfExploitConcatenated    int64
dtype: object

# **Data Transformation**

Transforming all the data in the dataset to numbers because that's what computers understand.

In [19]:
def map_column_to_ints(column):
    values = column.unique()
    values = sorted(values)

    mapped_ints = list(range(len(values)))
    mapped_to = {k:mapped_ints[i] for i,k in enumerate(values)}
    return mapped_to

In [20]:
HT_data_useful = HT_data.copy()

In [21]:
countries_mapped = map_column_to_ints(HT_data_useful.citizenship)
HT_data_useful.citizenship = HT_data_useful.citizenship.map(countries_mapped)

In [22]:
age_groups_mapped = map_column_to_ints(HT_data_useful.ageBroad)
HT_data_useful.ageBroad = HT_data_useful.ageBroad.map(age_groups_mapped)

In [23]:
genders_mapped = map_column_to_ints(HT_data_useful.gender)
HT_data_useful.gender = HT_data_useful.gender.map(genders_mapped)

In [24]:
HT_data_useful.sample(5)

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
46036,0,3,28,Sexual exploitation
34155,0,2,28,Sexual exploitation
19136,0,3,28,Sexual exploitation
13630,0,5,9,Forced labour
14235,0,6,9,Forced labour


In [25]:
HT_data_useful.typeOfExploitConcatenated.unique()

array(['Sexual exploitation', 'Forced labour', 'Other',
       'Forced labour;Other', 'Forced marriage',
       'Forced labour;Sexual exploitation;Combined sexual and labour exploitation'],
      dtype=object)

In [26]:
exploits_mapped = map_column_to_ints(HT_data_useful.typeOfExploitConcatenated)
HT_data_useful.typeOfExploitConcatenated = HT_data_useful.typeOfExploitConcatenated.map(exploits_mapped)

In [27]:
HT_data_useful.sample(5)

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
13022,0,2,9,0
14927,1,3,18,0
42665,0,8,28,5
10885,1,8,8,0
29276,1,5,2,0


In [28]:
minimax = MinMaxScaler()
HT_data_useful = minimax.fit_transform(HT_data_useful)

In [29]:
HT_data_useful = pd.DataFrame(HT_data_useful, columns=['gender', 'ageBroad', 'citizenship', 'typeOfExploitConcatenated'])
HT_data_useful.sample(5)

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
11365,0.0,0.25,0.933333,1.0
1892,0.0,1.0,0.2,0.0
4416,0.0,0.75,0.866667,0.0
565,0.0,0.375,0.566667,1.0
2119,1.0,0.625,0.066667,0.0


In [30]:
x = HT_data_useful.drop('typeOfExploitConcatenated', axis=1)
y = HT_data_useful.typeOfExploitConcatenated

In [31]:
x_tr, x_tt, y_tr, y_tt = train_test_split(x, y, test_size=0.3, random_state=333)

In [32]:
reg_model = LinearRegression()
reg_model.fit(x_tr, y_tr)
reg_model.score(x_tt, y_tt)

0.5344867765661272

In [33]:
predictions = reg_model.predict(x_tt)
predictions

array([0.72210989, 0.7634175 , 0.92043353, ..., 0.49054234, 0.7634175 ,
       0.47540407])

In [34]:
R2 = r2_score(y_tt, predictions)
print("Linear Model's R Squared: %f" % (R2))

Linear Model's R Squared: 0.534487


In [35]:
forest_model = RandomForestRegressor(random_state=999)
forest_model.fit(x_tr, y_tr)
forest_model.score(x_tt, y_tt)

0.9536949702013735

In [36]:
predictions = forest_model.predict(x_tt)
predictions

array([0., 1., 1., ..., 0., 1., 1.])

In [37]:
R2 = r2_score(y_tt, predictions)
print("Random Forest Model's R Squared: %f" % (R2))

Random Forest Model's R Squared: 0.953695


In [53]:
def make_prediction(age, gender, country):
  newage = ''
  ages_lb = [0, 9, 18, 21, 24, 27, 30, 39, 48]
  ageGroups = HT_data.ageBroad.unique()
  for i,lb in enumerate(ages_lb):
    if age > lb:
      pass
    else:
      for ageGroup in ageGroups:
        print(str(ageGroup))
        if str(ageGroup).startswith(str(lb)):
          newage = ageGroup
          break
      break
  print(newage)


In [42]:
HT_data.sample()

Unnamed: 0,gender,ageBroad,citizenship,typeOfExploitConcatenated
4927,0,2,26,5


In [54]:
make_prediction(15, 'Male', 'Ghana')

1
2
3
4
5
8
0
6
7

