In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

## Fit Iris Data using the first two features

In [None]:
# Iris data set: four numeric measurements plus the `variety` label.
# The URL embeds a revision hash, so the file contents should not drift.
IRIS_CSV_URL = 'https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv'
df = pd.read_csv(IRIS_CSV_URL)

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [None]:
def fit_tree(df, test_size=0.20, random_state=1):
    """Fit a decision tree on the first two columns of `df` and print its test accuracy.

    The rows are split into stratified train/test subsets, a
    ``DecisionTreeClassifier`` is fitted on the training subset, and the
    accuracy on the held-out subset is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns (by position) are used as features and
        whose ``variety`` column is the class label.
    test_size : float or int, default 0.20
        Forwarded to ``train_test_split`` as the size of the test subset.
    random_state : int, default 1
        Seed used for both the split and the tree, so that repeated calls on
        the same data print the same accuracy.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    X = df.iloc[:, 0:2]
    y = df.variety
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )

    # scikit-learn permutes the candidate features at every split, so an
    # unseeded tree may break ties differently from run to run. Fixing the
    # seed keeps the reported accuracy reproducible.
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    print('Test accuracy: %.3f' % model.score(X_test, y_test))

In [None]:
# Baseline: fit and score on the original, unreplicated rows.
fit_tree(df)

Test accuracy: 0.633


## Replicate data 10 times, split randomly, and fit again

In [None]:
# Row positions 0,0,...,1,1,...: every original row is selected 10 times,
# with the copies kept next to each other.
repeated_positions = np.repeat(np.arange(len(df)), 10)
df_repc = df.iloc[repeated_positions]

In [None]:
# Sanity check: the row count should be 10x that of the original frame.
df_repc.shape

(1500, 5)

In [None]:
# Same procedure on the replicated frame: the split now happens AFTER
# replication, so copies of one row can land in both train and test.
fit_tree(df_repc)

Test accuracy: 0.907


### After replication, the out-of-sample accuracy increased from 0.633 to 0.907.

Why is that? As I mentioned in the class meeting, splitting the data after replication puts copies of the same sample points in both the training set and the test set. The test set then contains information the model has already seen during training, which is called data leakage.
Let's verify this by replicating the data after splitting instead:

In [None]:
X = df.iloc[:, 0:2]
y = df.variety

# Split the ORIGINAL rows first, so no observation can appear on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

# Only now repeat every row 10 times, separately within each subset.
X_train, X_test, y_train, y_test = [
    part.iloc[np.arange(len(part)).repeat(10)]
    for part in (X_train, X_test, y_train, y_test)
]

# Seed the tree as well: scikit-learn permutes candidate features at each
# split, so an unseeded tree can give a slightly different accuracy per run,
# which would blur the comparison with the unreplicated baseline.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)

print('Test accuracy: %.3f' % model.score(X_test, y_test))

Test accuracy: 0.633


### You can see that the accuracy is the same as before replication (0.633). When the data are replicated after splitting, there is no data leakage, so the performance doesn't change.