Commit c8adb34
Author: Algorithmica
Add files via upload
1 parent 0bc4d53
File tree: 3 files changed (+161, −0)
File 1 of 3 — 52 additions, 0 deletions
@@ -0,0 +1,52 @@
import pandas as pd
import os
from sklearn import model_selection, decomposition, manifold, feature_selection, svm
import seaborn as sns
import numpy as np

import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils

data_dir = 'C:/Users/Algorithmica/Downloads/dont-overfit-ii'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)

# drop the id and target columns, keeping only the feature columns
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

# filter zero-variance features
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

# keep just enough principal components to explain 95% of the variance
lpca = decomposition.PCA(n_components=0.95)
lpca.fit(train2)
print(np.cumsum(lpca.explained_variance_ratio_))
train_pca = lpca.transform(train2)

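A note on n_components=0.95: when PCA receives a float in (0, 1), it keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal sketch on synthetic data (all names here are illustrative, not from the commit):

import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 10) @ rng.randn(10, 10)  # correlated columns
pca_demo = decomposition.PCA(n_components=0.95).fit(X_demo)
# n_components_ is chosen automatically; the cumsum shows the variance curve
print(pca_demo.n_components_)
print(np.cumsum(pca_demo.explained_variance_ratio_))
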
# visualize the classes in 3D with t-SNE (exploration only)
tsne = manifold.TSNE(n_components=3)
train_tsne = tsne.fit_transform(train_pca)
cutils.plot_data_3d_classification(train_tsne, y)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train_pca, y, test_size=0.1, random_state=1)

# check the class balance
sns.countplot(x='target', data=train)

# grid search an RBF-kernel SVM on the PCA features
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma': [0.01, 0.1, 1, 2, 5, 10], 'C': [0.001, 0.01, 0.1, 0.5]}
final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

print(final_estimator.score(X_eval, y_eval))

# apply the same preprocessing pipeline to the test data and write a submission
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]  # drop the id column
test2 = variance.transform(test1)
test_pca = lpca.transform(test2)
test['target'] = final_estimator.predict(test_pca)
test.to_csv(os.path.join(data_dir, 'submission.csv'), columns=['id', 'target'], index=False)
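
All three scripts lean on grid_search_best_model from the author's classification_utils module, which is not part of this commit. A plausible minimal sketch, assuming it simply wraps GridSearchCV and returns the refit best estimator; the signature is inferred from the call sites, and cv=10 is a guess:

from sklearn import model_selection

def grid_search_best_model(estimator, grid, X, y, scoring=None, cv=10):
    # hypothetical reconstruction: exhaustive grid search with cross-validation;
    # GridSearchCV refits the best configuration on the full data by default
    grid_search = model_selection.GridSearchCV(estimator, grid, scoring=scoring, cv=cv, n_jobs=-1)
    grid_search.fit(X, y)
    print(grid_search.best_params_, grid_search.best_score_)
    return grid_search.best_estimator_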
File 2 of 3 — 72 additions, 0 deletions
@@ -0,0 +1,72 @@
import pandas as pd
import os
from sklearn import ensemble, model_selection, feature_selection, svm
import seaborn as sns

import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
import common_utils as utils

data_dir = 'C:/Users/Algorithmica/Downloads/dont-overfit-ii'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)

# drop the id and target columns, keeping only the feature columns
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

# filter zero-variance features
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

# embedded feature selection with a random forest: keep the features whose
# importance is above the mean importance
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator, prefit=True, threshold='mean')
train3 = embedded_selector.transform(train1)
utils.plot_feature_importances(rf_final_estimator, train1, cutoff=50)

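plot_feature_importances comes from the author's common_utils module, which is also missing from this commit. A hypothetical sketch consistent with how it is called here (an estimator plus the feature DataFrame, with an optional cutoff on the number of bars):

import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importances(estimator, X, cutoff=40):
    # hypothetical reconstruction: bar chart of the top-`cutoff` importances
    importances = pd.Series(estimator.feature_importances_, index=X.columns)
    importances.sort_values(ascending=False).head(cutoff).plot.bar()
    plt.ylabel('importance')
    plt.tight_layout()
    plt.show()
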
# same embedded selection with extremely randomized trees
et_estimator = ensemble.ExtraTreesClassifier()
et_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
et_final_estimator = cutils.grid_search_best_model(et_estimator, et_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(et_final_estimator, prefit=True, threshold='mean')
train3 = embedded_selector.transform(train1)
utils.plot_feature_importances(et_final_estimator, train1, cutoff=50)

# and with gradient boosting
gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {'max_depth': [1, 2, 3], 'n_estimators': list(range(50, 300, 100)), 'learning_rate': [0.001, 0.1, 1.0]}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator, prefit=True, threshold='mean')
X_train1 = embedded_selector.transform(train1)
utils.plot_feature_importances(gb_final_estimator, train1)

# embedded selection with an SVM: an RBF kernel exposes neither coef_ nor
# feature_importances_, so SelectFromModel needs a linear kernel here
linear_svm_estimator = svm.SVC(kernel='linear')
linear_svm_grid = {'C': [0.001, 0.01, 0.1, 0.5]}
linear_svm_final_estimator = cutils.grid_search_best_model(linear_svm_estimator, linear_svm_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(linear_svm_final_estimator, prefit=True, threshold='mean')
X_train1 = embedded_selector.transform(train1)

# hold out an evaluation split from the selected features
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X_train1, y, test_size=0.1, random_state=1)

# check the class balance
sns.countplot(x='target', data=train)

kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma': [0.01, 0.1, 1, 2, 5, 10], 'C': [0.001, 0.01, 0.1, 0.5]}
final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

print(final_estimator.score(X_eval, y_eval))

# apply the same feature selection to the test data and write a submission
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]  # drop the id column
test_selected = embedded_selector.transform(test1)
test['target'] = final_estimator.predict(test_selected)
test.to_csv(os.path.join(data_dir, 'submission.csv'), columns=['id', 'target'], index=False)
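
The embedded-selection pattern above relies on scikit-learn's SelectFromModel. A self-contained illustration on synthetic data (names are illustrative), showing that threshold='mean' keeps exactly the features whose importance reaches the mean importance:

import numpy as np
from sklearn import datasets, ensemble, feature_selection

X_demo, y_demo = datasets.make_classification(n_samples=200, n_features=20, n_informative=4, random_state=1)
forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=1).fit(X_demo, y_demo)
selector = feature_selection.SelectFromModel(forest, prefit=True, threshold='mean')
X_kept = selector.transform(X_demo)
# columns with importance at or above the mean survive
print(X_demo.shape, '->', X_kept.shape)
print(np.sum(forest.feature_importances_ >= forest.feature_importances_.mean()))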
File 3 of 3 — 37 additions, 0 deletions
@@ -0,0 +1,37 @@
import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
from sklearn import model_selection, linear_model

# binary classification
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.4, 0.6], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# the liblinear solver supports both the l1 and l2 penalties searched below
lr_estimator = linear_model.LogisticRegression(solver='liblinear')
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train, y_train, scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_binary_classification(final_estimator, X_eval, y_eval)

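Since the script prints intercept_ and coef_, it may help to see what they parameterize: in the binary case the fitted model is P(y=1 | x) = 1 / (1 + exp(-(coef_ · x + intercept_))). A quick manual check against predict, continuing from the binary block above (it assumes that script state):

import numpy as np

w, b = final_estimator.coef_[0], final_estimator.intercept_[0]
p = 1.0 / (1.0 + np.exp(-(X_eval @ w + b)))  # P(y=1) for each eval row
# predict() returns class 1 exactly when this probability exceeds 0.5
print(np.array_equal(p > 0.5, final_estimator.predict(X_eval) == 1))
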
# multiclass classification
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=4, weights=[0.3, 0.3, 0.2, 0.2], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_train, y_train)

# liblinear handles the multiclass case one-vs-rest, so coef_ has one row per class
lr_estimator = linear_model.LogisticRegression(solver='liblinear')
lr_grid = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]}
final_estimator = cutils.grid_search_best_model(lr_estimator, lr_grid, X_train, y_train, scoring='accuracy')
print(final_estimator.intercept_)
print(final_estimator.coef_)
cutils.plot_model_2d_classification(final_estimator, X_train, y_train)
cutils.performance_metrics_hard_multiclass_classification(final_estimator, X_eval, y_eval)
