# Import data

In [17]:
import pandas as pd
historical_dataset = pd.read_csv("historical.csv")
historical_dataset.head()

Unnamed: 0,identifier,size,kitchens,bathrooms,floor,type,year,condition,elevator,subway,district,rooms,recentOwner,longitude,latitude,prediction,highValue
0,125858,3,2.0,7.0,3.0,2008.0,4,6,True,False,133828.0,3.0,True,-0.28785,-0.500141,True,True
1,129220,3,2.0,8.0,3.0,2001.0,4,6,True,False,73792.0,3.0,True,0.796448,-0.500141,True,False
2,148556,3,2.0,15.0,3.0,2009.0,4,6,True,False,43252.0,3.0,True,-2.860246,-0.500141,False,False
3,10272,3,2.0,20.0,3.0,2007.0,3,6,True,False,98975.0,2.0,True,-0.848421,-0.500141,True,True
4,8684,3,2.0,21.0,3.0,2008.0,4,6,True,False,66036.0,3.0,True,-0.223602,-0.500141,True,False


# Performance historical datset

In [18]:
# Count number of rows with prediction True
predicted_counts = historical_dataset['prediction'].value_counts()
n_contracts_rejected = predicted_counts[False]
n_contracts_accepted = predicted_counts[True]

# Filter rows with prediction True
contracts_accepted = historical_dataset[historical_dataset['prediction'] == True]
highValue_contracts = contracts_accepted[contracts_accepted['highValue'] == True]
lowValue_contracts = contracts_accepted[contracts_accepted['highValue'] == False]

money_out = 450 * len(contracts_accepted)
money_in = 100 * len(lowValue_contracts) + 600 * len(highValue_contracts)
total = money_in - money_out

print(f"Contracts accepted: {n_contracts_accepted} ({n_contracts_accepted/len(historical_dataset) * 100}%)")
print(f"Contracts rejected: {n_contracts_rejected} ({n_contracts_rejected/len(historical_dataset) * 100}%)")
print(f"High value contracts: {len(highValue_contracts)} ({len(highValue_contracts)/len(contracts_accepted) * 100}%)")
print(f"Low value contracts: {len(lowValue_contracts)} ({len(lowValue_contracts)/len(contracts_accepted) * 100}%)")
print(f"Balance: {total}")

Contracts accepted: 36382 (36.382%)
Contracts rejected: 63618 (63.617999999999995%)
High value contracts: 29649 (81.49359573415425%)
Low value contracts: 6733 (18.506404265845745%)
Balance: 2090800


In [19]:
n_high_values_without_contract = len(historical_dataset[(historical_dataset['highValue'] == True) & (historical_dataset['prediction'] == False)])
n_low_values_without_contract = len(historical_dataset[(historical_dataset['highValue'] == False) & (historical_dataset['prediction'] == False)])
n_high_total = len(historical_dataset[historical_dataset['highValue'] == True])
n_low_total = len(historical_dataset[historical_dataset['highValue'] == False])

print(f"No contract for high value: {n_high_values_without_contract} ({n_high_values_without_contract/n_high_total * 100}%)")
print(f"No contract for low value: {n_low_values_without_contract} ({n_low_values_without_contract/n_low_total * 100}%)")
best_case_total = (600-450) * n_high_total
print(f"Earnings best case scenario: {best_case_total} ({best_case_total/total * 100}%)")

No contract for high value: 18359 (38.241543076153974%)
No contract for low value: 45259 (87.04993075857824%)
Earnings best case scenario: 7201200 (344.42318729672854%)


In [11]:
# Information for the table
print(f"No Contract: {n_low_values_without_contract} (low) + {n_high_values_without_contract} (high) = {n_low_values_without_contract + n_high_values_without_contract}")
print(f"Contract: {len(lowValue_contracts)} (low) + {len(highValue_contracts)} (high) = {len(lowValue_contracts + highValue_contracts)}")
print(f"Total: {n_low_total} (low) + {n_high_total} (high) = {n_low_total + n_high_total}")

No Contract: 45259 (low) + 18359 (high) = 63618
Contract: 6733 (low) + 29649 (high) = 36382
Total: 51992 (low) + 48008 (high) = 100000


# Percentage of high value with contract

In [12]:
print(f"High value with contract: {len(highValue_contracts)} ({len(highValue_contracts)/n_high_total * 100}%)")
# Not higher because you want to lower your chances of getting a low value contract (find a balance)

High value with contract: 29649 (61.758456923846026%)


# Income current data (monopoly company)

In [20]:
import pandas as pd
current_dataset = pd.read_csv("current.csv")
current_dataset.head()

Unnamed: 0,identifier,size,kitchens,bathrooms,floor,type,year,condition,elevator,subway,district,rooms,recentOwner,longitude,latitude
0,19428,1,1.0,6.0,4.0,1986.0,4,4,True,False,69318.0,2.0,True,0.215669,-0.500141
1,137369,2,1.0,5.0,4.0,1962.0,3,2,True,False,72945.0,2.0,True,0.276088,-0.500141
2,33242,2,1.0,10.0,4.0,2007.0,3,6,True,False,60067.0,3.0,True,-1.377581,-0.500141
3,147325,1,1.0,19.0,3.0,2007.0,1,6,False,False,67339.0,1.0,False,0.7341,-0.500141
4,109748,2,1.0,5.0,4.0,1979.0,3,2,True,False,117634.0,2.0,True,0.328889,-0.500141


## Percentages
Let's make a guess for the incomes of the compony if they keep their monompoly based only on the percentages of high/low value contracts.

In [21]:
# Help function for calculating the income, will probably need it later
def calculate_income(high_value_contracts, low_value_contracts):
    return (600-450) * high_value_contracts + (100-450) * low_value_contracts

## Validate guess

In [22]:
# Split dataset into training and validation, to check how well our guess is
from sklearn.model_selection import train_test_split
train, test = train_test_split(historical_dataset, test_size=0.2)

p_low_contract = len(train[(train["highValue"] == False) & (train["prediction"] == True)]) / len(train)
p_high_contract = len(train[(train["highValue"] == True) & (train["prediction"] == True)]) / len(train)

n_low_guess = len(test) * p_low_contract
n_high_guess = len(test) * p_high_contract

n_low = len(test[(test["highValue"] == False) & (test["prediction"] == True)])
n_high = len(test[(test["highValue"] == True) & (test["prediction"] == True)])

# TODO: add percentages here how far the guess is from the actual data
print(f"Guessing {n_low_guess} low value contracts, {n_high_guess} high value contracts, so an income of {calculate_income(n_high_guess, n_low_guess)}")
print(f"Actually {n_low} low value contracts, {n_high} high value contracts, so an income of {calculate_income(n_high, n_low)}")
print(f"Diff low: {n_low - n_low_guess} ({(n_low - n_low_guess)/n_low * 100}%)")
print(f"Diff high: {n_high - n_high_guess} ({(n_high - n_high_guess)/n_high * 100}%)")
print(f"Diff income: {calculate_income(n_high, n_low) - calculate_income(n_high_guess, n_low_guess)} ({(calculate_income(n_high, n_low) - calculate_income(n_high_guess, n_low_guess))/calculate_income(n_high, n_low) * 100}%)")

Guessing 1350.0 low value contracts, 5927.750000000001 high value contracts, so an income of 416662.5000000001
Actually 1333 low value contracts, 5938 high value contracts, so an income of 424150
Diff low: -17.0 (-1.275318829707427%)
Diff high: 10.24999999999909 (0.1726170427753299%)
Diff income: 7487.499999999884 (1.765295296475276%)


## Actual guess

In [23]:
p_low_contract = len(lowValue_contracts) / len(historical_dataset)
p_high_contract = len(highValue_contracts) / len(historical_dataset)
n_current = len(current_dataset)

n_low = n_current * p_low_contract
n_high = n_current * p_high_contract
print(f"Guessing {n_low} low value contracts, {n_high} high value contracts, so an income of {calculate_income(n_high, n_low)}")

Guessing 1346.6 low value contracts, 5929.799999999999 high value contracts, so an income of 418159.99999999994


## Model
Let's now try to build an actual model that aims to predict whether the company takes the contract or not, since the previous method only gives an estimation of the income and doesn't acutally predict which contracts are taken.

This is the data we currently have:

In [24]:
print("Train and Test datasets")
train.head()

Train and Test datasets


Unnamed: 0,identifier,size,kitchens,bathrooms,floor,type,year,condition,elevator,subway,district,rooms,recentOwner,longitude,latitude,prediction,highValue
72978,26126,2,1.0,6.0,4.0,2001.0,3,4,True,False,40934.0,2.0,False,1.681468,-0.500141,False,False
64529,128080,2,1.0,6.0,4.0,1983.0,3,2,True,False,92624.0,2.0,True,0.445151,-0.500141,True,True
82369,75486,1,1.0,22.0,1.0,2009.0,4,6,True,False,45099.0,2.0,False,-0.647432,-0.500141,False,True
41328,144814,3,2.0,18.0,4.0,2012.0,4,6,False,False,52370.0,3.0,False,-2.419522,-0.500141,False,False
7117,82910,3,1.0,13.0,4.0,2014.0,1,6,False,False,52646.0,3.0,False,-1.581232,-0.499704,False,False


In [25]:
print("Current dataset")
current_dataset.head()

Current dataset


Unnamed: 0,identifier,size,kitchens,bathrooms,floor,type,year,condition,elevator,subway,district,rooms,recentOwner,longitude,latitude
0,19428,1,1.0,6.0,4.0,1986.0,4,4,True,False,69318.0,2.0,True,0.215669,-0.500141
1,137369,2,1.0,5.0,4.0,1962.0,3,2,True,False,72945.0,2.0,True,0.276088,-0.500141
2,33242,2,1.0,10.0,4.0,2007.0,3,6,True,False,60067.0,3.0,True,-1.377581,-0.500141
3,147325,1,1.0,19.0,3.0,2007.0,1,6,False,False,67339.0,1.0,False,0.7341,-0.500141
4,109748,2,1.0,5.0,4.0,1979.0,3,2,True,False,117634.0,2.0,True,0.328889,-0.500141


### Data

In [26]:
train_data = train.drop(['prediction', 'highValue'], axis=1)
assert train_data.columns.to_list() == current_dataset.columns.to_list()
train_target = train["prediction"]

test_data = test.drop(['prediction', 'highValue'], axis=1)
assert test_data.columns.to_list() == current_dataset.columns.to_list()
test_target = test["prediction"]

### RandomForestClassifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_data, train_target)

RandomForestClassifier(max_depth=2, random_state=0)

In [28]:
clf.score(test_data, test_target)

0.95245

### LinearSVC

In [22]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(train_data, train_target)



LinearSVC()

In [23]:
clf.score(test_data, test_target)

0.87515

### SGDClassifier

In [24]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(train_data, train_target)

SGDClassifier()

In [18]:
clf.score(test_data, test_target)

0.9508

### Hyperparameter tuning
RandomForestClassifier clearly is the best, so lets try to find the best hyperparameters for it. (since finding optimal parameters for each of the models will take too long)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rfc=RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
} # https://www.kaggle.com/code/sociopath00/random-forest-using-gridsearchcv/notebook
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, verbose=10)
CV_rfc.fit(train_data, train_target)
CV_rfc.best_params_

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5; 1/60] START criterion=gini, max_depth=4, max_features=auto, n_estimators=200
[CV 1/5; 1/60] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=0.966 total time=   4.8s
[CV 2/5; 1/60] START criterion=gini, max_depth=4, max_features=auto, n_estimators=200
[CV 2/5; 1/60] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=0.970 total time=   4.7s
[CV 3/5; 1/60] START criterion=gini, max_depth=4, max_features=auto, n_estimators=200
[CV 3/5; 1/60] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=0.970 total time=   4.7s
[CV 4/5; 1/60] START criterion=gini, max_depth=4, max_features=auto, n_estimators=200
[CV 4/5; 1/60] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=0.968 total time=   4.6s
[CV 5/5; 1/60] START criterion=gini, max_depth=4, max_features=auto, n_estimators=200
[CV 5/5; 1/60] END criterion=gini, max_d

This gives the optimal parameters (with 98% accuracy), but takes quite some time to run. So we'll just use the default parameters in the interest of time.

### Some used references
https://datascience.stackexchange.com/questions/57863/why-my-svm-svc-fit-linear-kernal-run-so-long-time
https://stackoverflow.com/questions/34251980/a-progress-bar-for-scikit-learn
https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
## Profits
In order to estimate the profits, we also need a model that predicts whether a contract is high or low value, so let's also use a RandomForestClassifier.

In [29]:
train_target_value = train["highValue"]
test_target_value = test["highValue"]

In [30]:
from sklearn.ensemble import RandomForestClassifier
clf_value = RandomForestClassifier(max_depth=2, random_state=0)
clf_value.fit(train_data, train_target_value)

RandomForestClassifier(max_depth=2, random_state=0)

Also here we can find the optimal hyperparameters by training different models and running grid search on each of them to find the optimal values, but I'll skip this in the interest of time and just use the default values.
Let's now predict for each of them whether it's high value or low value.

In [31]:
is_high_value = clf_value.predict(current_dataset)
takes_contract = clf.predict(current_dataset)

In [32]:
def calculate_income_high_contract(high_vector, contract_vector):
    dataset = pd.DataFrame({'highValue': high_vector, 'predictions': contract_vector}, columns=['highValue', 'predictions'])
    highValue_contracts = dataset[(dataset['highValue'] == True) & (dataset['predictions'] == True)]
    lowValue_contracts = dataset[(dataset['highValue'] == False) & (dataset['predictions'] == True)]
    print("High value contracts: ", len(highValue_contracts))
    print("Low value contracts: ", len(lowValue_contracts))
    return calculate_income(len(highValue_contracts), len(lowValue_contracts))

In [33]:
calculate_income_high_contract(is_high_value, takes_contract)

High value contracts:  6765
Low value contracts:  0


1014750

It's still strange that we only get 1 low value contract, so further hyperparameter tuning might be needed, but there is currently no time for that.

# Contracts to take
In order to know which contracts to take, we need to get all highValue houses, that the other company doesn't take. We can just use the same models as before for this.

In [36]:
from numpy import savetxt
dataset = pd.DataFrame({'identifier': current_dataset['identifier'],'highValue': is_high_value, 'predictions': takes_contract}, columns=['identifier', 'highValue', 'predictions'])
contracts_to_take = dataset[(dataset['highValue'] == True)]
ids = contracts_to_take["identifier"]
savetxt("selection.csv", ids, delimiter=",", fmt='%s')

# Expected Earnings

In [35]:
shared_contracts = contracts_to_take[(contracts_to_take['predictions'] == True)]
non_shared_contracts = contracts_to_take[(contracts_to_take['predictions'] == False)]
(600-450) * len(shared_contracts) + (600-450)/2 * len(non_shared_contracts)

1239150.0