In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [3]:
import pandas as pd

In [4]:
classifiers = [DecisionTreeClassifier(),
              RandomForestClassifier(), 
               AdaBoostClassifier(), 
               GradientBoostingClassifier()]
log_cols = ['Classifier','Accuracy']
log = pd.DataFrame(columns=log_cols)

In [5]:
samples = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [6]:
samples

StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.1,
            train_size=None)

In [7]:
train = pd.read_csv('data/train_final.csv')
test = pd.read_csv('data/test_final.csv')

In [8]:
train.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone,Title
0,0,1,1,1,1,0,1.0,1,1,1
1,1,1,1,0,3,4,1.0,2,0,3
2,2,1,3,0,2,1,0.0,1,1,2
3,3,1,1,0,3,4,0.0,2,0,3
4,4,0,3,1,3,1,0.0,1,1,1


In [9]:
test.head()

Unnamed: 0.1,Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Title,IsAlone
0,0,3,1,3,0.0,2,1,1,1
1,1,3,0,4,0.0,0,2,3,0
2,2,2,1,4,1.0,2,1,1,1
3,3,3,1,2,1.0,0,1,1,1
4,4,3,0,1,2.0,0,3,3,0


In [10]:
train.shape

(891, 10)

In [11]:
train.columns

Index(['Unnamed: 0', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
       'FamilySize', 'IsAlone', 'Title'],
      dtype='object')

In [12]:
X = train.drop(['Survived'],axis=1)
y = train['Survived']

In [13]:
X.shape, y.shape

((891, 9), (891,))

In [14]:
for train_index, test_index in samples.split(X,y):
    print(train_index)
    print(test_index)

[349 134 487 519 225  30 129 580 103 649 128 137 209 177 809 542 404 632
 130 475 254 385 266 290 686 119 300 285 464 131 843 703 456 255 234 341
 430 444 662  66 124 447 698 605 116 844 299 702 102 614 775  27  58 427
 410  77 123 830  18 238 529 685 826 164 799 690 885 839   2 438 591 226
 638 409 556 120 126 125 537 567  86 327 545 755 787  13 472 560 797 823
 170 135 117 288 305 503 233 147 767 435 273 717 619 513 520 523 432 471
 247 152 530 563 550 516 399  89 804 198 802 596 240 262  35 581 814  12
 509  67  29 515 590 757 205 776 594 588 807 150 283 714 665 571 627 319
 308 848 424  65 652 180 357  81 148 522 312 762 364 185 406   6 190 750
 352  47 634 201  59 369 883 538 793  41 525 463 880 602 859 849 189 257
 536 239 159 695 114 264 697 479 315 309 527 615 524 484 803 656 606 789
 366  56 488 166 429 636 663 470  95 473 667 461 269 163 688 806 608 242
 815 733 783 338 505  99 584 650 365 756 351  17 681 310 600 379 732 684
 820 183 860 498 771 701 821 715 219 546 502 630 19

In [16]:
train_index.shape, test_index.shape

((801,), (90,))

In [25]:
del train_index
del test_index

In [20]:
pd.DataFrame(train).isnull().count(), pd.DataFrame(test).isnull().count()

(Unnamed: 0    891
 Survived      891
 Pclass        891
 Sex           891
 Age           891
 Fare          891
 Embarked      891
 FamilySize    891
 IsAlone       891
 Title         891
 dtype: int64, Unnamed: 0    418
 Pclass        418
 Sex           418
 Age           418
 Fare          418
 Embarked      418
 FamilySize    418
 Title         418
 IsAlone       418
 dtype: int64)

In [27]:
for train_index, test_index in samples.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print(X_train.shape, X_test.shape)
    print(y_train.shape, y_test.shape)

KeyError: '[349 134 487 519 225  30 129 580 103 649 128 137 209 177 809 542 404 632\n 130 475 254 385 266 290 686 119 300 285 464 131 843 703 456 255 234 341\n 430 444 662  66 124 447 698 605 116 844 299 702 102 614 775  27  58 427\n 410  77 123 830  18 238 529 685 826 164 799 690 885 839   2 438 591 226\n 638 409 556 120 126 125 537 567  86 327 545 755 787  13 472 560 797 823\n 170 135 117 288 305 503 233 147 767 435 273 717 619 513 520 523 432 471\n 247 152 530 563 550 516 399  89 804 198 802 596 240 262  35 581 814  12\n 509  67  29 515 590 757 205 776 594 588 807 150 283 714 665 571 627 319\n 308 848 424  65 652 180 357  81 148 522 312 762 364 185 406   6 190 750\n 352  47 634 201  59 369 883 538 793  41 525 463 880 602 859 849 189 257\n 536 239 159 695 114 264 697 479 315 309 527 615 524 484 803 656 606 789\n 366  56 488 166 429 636 663 470  95 473 667 461 269 163 688 806 608 242\n 815 733 783 338 505  99 584 650 365 756 351  17 681 310 600 379 732 684\n 820 183 860 498 771 701 821 715 219 546 502 630 199 693 873 232   1  19\n 575 554 621  57 613  82  49 382 604 249 453 570  40 151 528 393 455 263\n 812 286 306 673 386  50 585 436 862  11 654 441 758 465 817 106  62 317\n 850 845 331 462 612 744 670 346 203 101 837 371 448 739 497 816 113  24\n 167 819 532 772  61 127 766 858 562 350 329 110 539 564 105 454 707 478\n 531 841 274 710 354 268 659  74 215 765 752 521 179 857 805 380 623 138\n 426 889 449 553 348 321 200  20 334 624 867 692 796 111 810 356 811  39\n 699  37 298 890  22  31  43 691 625 140 779 610 378 370 422 153 367 651\n 565 387 832  23  96 265 589 342 451 139 616 303 786  16 174 548 886  15\n 173 734 194 252 853 372 413   5 738 358 877   8 742 156 373 682 256 316\n 227 425 218 836 210 660 326 543 343 887 568 743 221  69 467 719  21 499\n  80 555 773 360 281 881 347 388 648 782 375 716 376 277 769 184  72 728\n 828 469 204 408 253 322 726 398 876 446 593 243 325 212 852 617  91 653\n 633 753  92 480 639 780 333 149 763  28  84 501 214 122 865 547 313 647\n 459 791   0 162 729  63 874 781 235 540 496 720 175 208 551   4 718 136\n 434 582 482 706 512 785  45 642 784 318 477 708 222 397 328 390 722 700\n 759 259  88 494 607 421 861 559  85 241 304 500 244 324 611 704 485 671\n 433 287 888 236 457 157 552 414 301 696 272 431 601 402 423 856 230 535\n 644 246 359 192 583 740 834  70 829 526 678 609 458  73 295 213 133 518\n  64 689 374  55  93 291 420 725 490 121 549   9 777 731 741 468 587 452\n 311 362 144 237 340 747 855 669 146 822 108 631 450 245 216 335 155 320\n 626  60  38 723 770 599 411 207 798 573 847  94 363 171  68  76 760 332\n 679 884 276 270 384  78 104 840 416 510 181 749 248 282  83 289 788 418\n 569 400 292 635 827 871 872 730 646 661  46 158 160 712 460 489 658 705\n 761 517 578 280 381 132 250 764 687 870 576 869 344 748  25 813  34 875\n 439 514  79 795  52 745 754 278 172 251 674 713 664 595 261 293 186 579\n 231 668 345 115  36 574 620 868 677 314 193  10  14  75  48 736 195 228\n 109 481 412 629 107 307 154 141 491 561 863  32 297  90 879 557 220 628\n 825 339 778 182 878 492 391 735  97 835 187  98 323 774  42 442 711 217\n 165 294 882  53 466 504 866 476 260 655 302 508 724 534 407 544 395 746\n 419 694 603 112 296 337 666 178 833 118 622 415 403 202  71 197 507 474\n 191  33 206 794 161   7 598 846 533] not in index'

In [21]:
from sklearn.linear_model import LogisticRegression

In [23]:
log_reg = LogisticRegression()
log_reg.fit(X,y)
y_pred = log_reg(test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [24]:
X = X.drop(['Unnamed: 0'],axis=1)

In [25]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone',
       'Title'],
      dtype='object')

In [26]:
test.columns

Index(['Unnamed: 0', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize',
       'Title', 'IsAlone'],
      dtype='object')

In [27]:
test = test.drop(['Unnamed: 0'], axis=1)

In [28]:
log_reg = LogisticRegression()
log_reg.fit(X,y)
y_pred = log_reg(test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [29]:
test.isnull().sum()

Pclass        0
Sex           0
Age           0
Fare          1
Embarked      0
FamilySize    0
Title         0
IsAlone       0
dtype: int64

In [30]:
test.Fare.isnull().values.any()

True

In [31]:
test[test['Fare'].isnull()]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Title,IsAlone
152,3,1,4,,0,1,1,1


In [32]:
test['Fare'].iloc[152] 

nan

In [33]:
avg = test['Fare'].mean()

In [34]:
test['Fare'].iloc[152] = 2
test['Fare'].iloc[152] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


2.0

In [35]:
test.Fare.astype(int)

0      0
1      0
2      1
3      1
4      2
5      1
6      0
7      3
8      0
9      3
10     1
11     3
12     4
13     3
14     4
15     3
16     2
17     0
18     1
19     0
20     4
21     0
22     3
23     4
24     4
25     2
26     4
27     0
28     3
29     3
      ..
388    0
389    2
390    4
391    3
392    2
393    1
394    3
395    4
396    0
397    4
398    0
399    0
400    4
401    2
402    4
403    4
404    3
405    2
406    1
407    4
408    0
409    2
410    0
411    4
412    0
413    1
414    4
415    0
416    1
417    3
Name: Fare, Length: 418, dtype: int32

In [36]:
X.isnull().any()

Pclass        False
Sex           False
Age           False
Fare          False
Embarked       True
FamilySize    False
IsAlone       False
Title         False
dtype: bool

In [37]:
X[X['Embarked'].isnull()]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone,Title
61,1,0,3,4,,1,1,2
829,1,0,4,4,,1,1,3


In [38]:
X['Embarked'].median()

0.0

In [39]:
X['Embarked'].iloc[61] = 0
X['Embarked'].iloc[829] = 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [40]:
X.isnull().sum()

Pclass        0
Sex           0
Age           0
Fare          0
Embarked      0
FamilySize    0
IsAlone       0
Title         0
dtype: int64

In [41]:
y.isnull().sum()

0

In [42]:
test.isnull().sum()

Pclass        0
Sex           0
Age           0
Fare          0
Embarked      0
FamilySize    0
Title         0
IsAlone       0
dtype: int64

In [44]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone,Title
0,1,1,1,0,1.0,1,1,1
1,1,0,3,4,1.0,2,0,3
2,3,0,2,1,0.0,1,1,2
3,1,0,3,4,0.0,2,0,3
4,3,1,3,1,0.0,1,1,1


In [45]:
y.head()

0    1
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [46]:
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Title,IsAlone
0,3,1,3,0.0,2,1,1,1
1,3,0,4,0.0,0,2,3,0
2,2,1,4,1.0,2,1,1,1
3,3,1,2,1.0,0,1,1,1
4,3,0,1,2.0,0,3,3,0


In [48]:
X.to_csv("X_data.csv")
y.to_csv("y_data.csv")
test.to_csv("test.csv")

In [49]:
log_reg = LogisticRegression()
log_reg.fit(X,y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
log_reg.params()

AttributeError: 'LogisticRegression' object has no attribute 'params'

In [51]:
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
y_pred = log_reg.predict(test)

In [53]:
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [54]:
count = 0
total = 0
for i in y_pred:
    if i == 0:
        count += 1
    total += 1

print("class: 1 ==>", total-count)
print("class: 0 ==>", count)

class: 1 ==> 144
class: 0 ==> 274


In [55]:
y_pred[1]

0

In [56]:
samples

StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.1,
            train_size=None)

In [58]:
test_index

array([158, 631, 376,  81, 856, 676, 763,   7, 757, 336, 203, 724, 588,
       536, 154, 587, 493, 596, 348, 530, 360, 113, 644, 378, 511, 422,
       130, 865, 597, 458,  45, 455, 773, 358, 220, 521, 568, 398, 787,
       603, 678, 311,  52, 768, 507, 264, 795, 850, 303, 646, 284, 633,
       722,  31, 269, 241,   2, 718, 533, 882, 669, 593, 263, 392, 401,
       178,  94, 571, 565,  49, 495, 670, 737, 617, 859, 249, 202, 611,
       561, 756, 599, 471, 374, 429, 119, 464,  75, 519, 393,  77],
      dtype=int64)