In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, add_dummy_feature, FunctionTransformer, PolynomialFeatures, KBinsDiscretizer, OrdinalEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_regression, SelectPercentile, RFE
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.datasets import fetch_california_housing, make_friedman1
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [None]:
# --- Structure and summary statistics -------------------------------------
# NOTE(review): `dataset`, `abalone_data` and `sns` are not defined in the
# visible cells; they are assumed to be loaded/imported elsewhere.
dataset.info()
dataset.corr()
dataset.describe().transpose()

# --- Distributions: histograms, density estimates, horizontal box plot -----
dataset.hist()
dataset.plot(kind='kde')
abalone_data.plot(kind='box', vert=False)

# Horizontal violin plot on a white-grid theme.
sns.set(style='whitegrid')
sns.violinplot(data=abalone_data, orient="h", scale="width")

# --- Missing values: per-column count, then grand total ---------------------
missing_per_column = dataset.isna().sum()
missing_per_column
missing_per_column.sum()

# --- Distinct values of the target column and how many there are -----------
dataset['price_range'].unique()
dataset['price_range'].nunique()

# Boolean mask marking rows whose target holds the '?' placeholder.
dataset['price_range'] == '?'

# Turn the '?' placeholder into a real missing value. The original called
# .replace(..., inplace=True) on a bare `price_range` name that is never
# defined; assigning the result back to the column is explicit and can be
# re-run safely.
dataset['price_range'] = dataset['price_range'].replace('?', np.nan)

# Map the categories of the 'Sex' column to integers. replace() returns a new
# frame, so the result is kept under its own name instead of being discarded.
# NOTE(review): `data` is not defined in the visible cells — confirm its source.
data_encoded = data.replace({'Sex': {'M': 0, 'F': 1, 'I': 2}})

# Frequency of each distinct row, then the number of rows whose target is 1.
dataset.value_counts()
(dataset['price_range'] == 1).sum()

# Separate target and features WITHOUT mutating `dataset`.
# dataset.pop('price_range') would also return the target, but it removes the
# column in place, so every later dataset.price_range / drop(...) in this cell
# would fail once it has run.
y = dataset['price_range']
# Equivalent spelling: dataset.drop(["price_range"], axis=1)
X = dataset.drop(columns=["price_range"])

# Row filtering with boolean masks. The mask is built from the same frame it
# indexes (the original indexed `df` with a mask taken from `dataset`).
dataset[dataset['price_range'].eq(1)]
# Rows whose 'name' value starts with 'W'.
# NOTE(review): `df` is not defined in the visible cells — confirm its source.
df[df['name'].str[0] == 'W']

In [None]:
# Every transformer object below is applied with <object>.fit_transform(data).

# Turns a list of feature dicts into a dense array.
dv = DictVectorizer(sparse=False)

# Fills NaN entries with the column mean.
si = SimpleImputer(missing_values=np.nan, strategy='mean')

# Scalers. MinMaxScaler is now instantiated: the original bound the class
# itself (`mm = MinMaxScaler`), so mm.fit_transform(data) would have failed.
mas = MaxAbsScaler()
mm = MinMaxScaler()

ss = StandardScaler()

# Prepends a constant column to X (add_dummy_feature is a function, not a class).
X_new = add_dummy_feature(X)

# Polynomial and interaction terms up to degree 2.
poly = PolynomialFeatures(degree=2)

In [None]:
# FunctionTransformer: apply log(1 + p) element-wise.
# NOTE(review): `wine_data` is assumed to be a DataFrame loaded elsewhere.
transformer = FunctionTransformer(func=np.log1p, validate=True)

wine_values = np.array(wine_data)
wine_data_transformed = transformer.transform(wine_values)

# Put the original column labels back and show the transposed summary.
transformed_frame = pd.DataFrame(wine_data_transformed, columns=wine_data.columns)
transformed_frame.describe().T

In [None]:
# KBinsDiscretizer: cut one continuous column into 10 bins, one-hot encoded.
enc = KBinsDiscretizer(n_bins=10, encode='onehot')

# The estimator needs 2-D input, hence reshape(-1, 1).
# The column is stored under its own name: the original reused `X`, which
# silently replaced the feature matrix consumed by later cells
# (e.g. train_test_split(X, y)).
chlorides_2d = np.array(wine_data['chlorides']).reshape(-1, 1)
enc.fit_transform(chlorides_2d)

In [None]:
# --- OneHotEncoder ----------------------------------------------------------
# Encoders expect 2-D input, so the label column is reshaped to (n, 1).
onehotencode = OneHotEncoder(categories='auto')
onehotencode.fit_transform(iris_data.labels.values.reshape(-1, 1))

# --- OrdinalEncoder ---------------------------------------------------------
enc = OrdinalEncoder()
# A pandas Series has no .reshape(); go through .values (the NumPy array),
# exactly as the OneHotEncoder call above does.
enc.fit_transform(iris_data.labels.values.reshape(-1, 1))

# --- MultiLabelBinarizer ----------------------------------------------------
mlb = MultiLabelBinarizer()

# --- pandas.get_dummies -----------------------------------------------------
# NOTE(review): this reads column 'label' while the encoders above read
# `labels` — confirm which column name the frame actually has.
iris_data_onehot = pd.get_dummies(iris_data, columns=['label'], prefix='one_hot')

In [None]:
# ColumnTransformer: apply a different transformer to each set of columns and
# concatenate the results.
ct = ColumnTransformer([
    # Fixed typo: the original referenced the undefined name `MinxMaxScaler`.
    ('scaler', MinMaxScaler(), [0]),
    # Column 0 is also forwarded untouched, next to its scaled copy.
    ('pass', 'passthrough', [0]),
    ('encoder', OneHotEncoder(), [1]),
])
ct.fit_transform(iris_data)

In [None]:
# TransformedTargetRegressor: fit the regressor on a transformed target.
# NOTE(review): X_train / X_test / y_train / y_test (capital X) are not created
# in the visible cells — the split cell produces lowercase x_train; confirm.
transformer = MinMaxScaler()
regressor = LinearRegression()

regr = TransformedTargetRegressor(
    regressor=regressor,
    transformer=transformer,
)
regr.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
regr.score(X_test, y_test)

In [None]:
# --- VarianceThreshold: drop features whose variance is below the threshold --
variance_threshold = VarianceThreshold(threshold=9)

# --- SelectKBest: keep the 3 features with the highest mutual information ---
skb = SelectKBest(mutual_info_regression, k=3)
# The selector has to be fitted before the selected names can be queried;
# the original called get_feature_names_out() on an unfitted object.
skb.fit(X_train, y_train)
skb.get_feature_names_out()

# --- SelectPercentile: keep the top 30 % of features by the same score ------
sp = SelectPercentile(mutual_info_regression, percentile=30)

# --- Recursive Feature Elimination -----------------------------------------
# Removes one feature per iteration (step=1) until 3 remain.
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1)
selector.fit(X_train, y_train)
print(selector.support_) # boolean array of selected features
print(selector.ranking_) # ranking of features


In [None]:
# Pipeline: chain named steps into one estimator.
# NOTE(review): PCA runs before StandardScaler here; scaling is more commonly
# applied before PCA — confirm the order is intended.
estimators = [
    ('SimpleImputer', SimpleImputer()),
    ("PCA", PCA()),
    ('StandardScaler', StandardScaler()),
]
# The constructor keyword is `steps` (not `step`), and the object is bound to
# `pipe`, the name the following lines and the GridSearchCV cell use.
pipe = Pipeline(steps=estimators)

# Three ways to reach the first step.
pipe.steps[0]          # (name, estimator) tuple; the attribute is `steps`
pipe[0]                # estimator by position
pipe['SimpleImputer']  # estimator by name

# Nested parameters are addressed as <step name>__<parameter name>.
pipe.set_params(PCA__n_components=2)
# get_params() takes no parameter names; read the value from the returned dict.
pipe.get_params()['SimpleImputer__strategy']

In [None]:
# GridSearchCV: 5-fold search over whole pipeline steps and one hyper-parameter.
# NOTE(review): the keys assume `pipe` has steps named 'imputer' and 'clf';
# the pipeline built in the Pipeline cell uses different step names — confirm
# which pipeline is meant before calling grid_search.fit(...).
param_grid = {
    # Candidate replacements for the whole 'imputer' step.
    'imputer': ['passthrough', SimpleImputer(), KNNImputer()],
    # Candidate replacements for the whole 'clf' step.
    'clf': [SVC(), LogisticRegression()],
    # Values tried for parameter C of the 'clf' step.
    'clf__C': [0.1, 1, 10, 100, 1000],
}
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [None]:
# --- RandomUnderSampler ------------------------------------------------------
undersample = RandomUnderSampler(random_state=0)
X_rus, y_rus = undersample.fit_resample(X_train, y_train)

# --- RandomOverSampler -------------------------------------------------------
oversample = RandomOverSampler(random_state=0)
X_ros, y_ros = oversample.fit_resample(X_train, y_train)

# --- SMOTE -------------------------------------------------------------------
# Seeded like the two samplers above so that re-running the cell reproduces
# the same resampled set (the original left SMOTE unseeded).
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k = 7; fit() returns the fitted estimator itself.
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)
y_predict = knn.predict(x_test)

# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

# Confusion matrix computed from (y_test, y_predict).
cm = confusion_matrix(y_test, y_predict)
print(cm)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings; fit() returns the estimator.
svm = SVC().fit(x_train, y_train)
y_predict = svm.predict(x_test)

# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

# Confusion matrix computed from (y_test, y_predict).
cm = confusion_matrix(y_test, y_predict)
print(cm)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a seeded decision tree on the training split;
# the cell displays the array of per-fold scores.
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 10; fit() returns the estimator.
clf = RandomForestClassifier(max_depth=10, random_state=0).fit(x_train, y_train)

# Accuracy on the same split the model was trained on.
clf.score(x_train, y_train)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 10 support-vector classifiers.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, so the positional form works on both old and new releases.
# The model is fitted on the training split (the original fitted on the full
# X, y, which includes the held-out rows), matching the neighbouring cells.
clf = BaggingClassifier(SVC(), n_estimators=10, random_state=0).fit(x_train, y_train)

# Accuracy on the training split.
clf.score(x_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import warnings

# Silences every warning from this point on.
warnings.simplefilter("ignore")
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Three heterogeneous base classifiers.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

# Hard voting: the ensemble predicts the majority class label.
voters = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
eclf1 = VotingClassifier(estimators=voters, voting='hard')

# Fitted and scored on the full data set (X, y).
eclf1.fit(X, y)
eclf1.score(X, y)