In [1]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.datasets import make_classification

In [2]:
from imblearn.over_sampling import SMOTE
import pandas as pd 
import numpy as np
from collections import Counter
import pyspark.pandas as ps
import findspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession \
    .builder \
    .appName("resample") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .master("local[*]") \
    .getOrCreate()



#### SMOTE

In [15]:
X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                            n_redundant=0, n_repeated=0, n_classes=4,
                            n_clusters_per_class=1,
                           weights=[0.4, 0.2, 0.15, 0.25],
                            class_sep=0.3)

data = pd.DataFrame(X, columns=['x1', 'x2', 'x3', 'x4'])
data = pd.concat([data, pd.Series(y, name='y')], axis=1)
data_columns = data.columns
data

Unnamed: 0,x1,x2,x3,x4,y
0,1.978243,-0.015516,0.552249,0.605695,3
1,-0.934981,1.737888,0.865597,0.918625,3
2,0.271393,0.255951,0.420262,0.425519,0
3,0.472396,0.603838,1.270811,0.411008,1
4,-1.606703,0.247036,2.801565,1.037413,1
...,...,...,...,...,...
995,0.577414,0.949645,-0.140962,-0.058483,2
996,-0.329691,1.308987,-1.335442,0.196809,2
997,-0.991516,-2.035151,-0.344045,-0.359092,0
998,1.879831,-0.079252,3.004532,1.198544,3


In [129]:
Counter(data[label_col[0]])

Counter({1: 23, 2: 50, 0: 27})

In [95]:
kind_nums = {'0': 10, 1: 30}
modified_kind_nums = {int(key) if str(key).isdigit() else key: value for key, value in kind_nums.items()}
modified_kind_nums

{0: 10, 1: 30}

In [112]:
all([origin_label[key] <= modified_kind_nums[key] for key in modified_kind_nums.keys()])

False

In [132]:
kind_nums = {'0': 30, 1: 30}
modified_kind_nums = {int(key) if str(key).isdigit() else key: value for key, value in kind_nums.items()}
X_resampled, y_resampled = SMOTE(sampling_strategy=modified_kind_nums, random_state=90, k_neighbors=4).fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[(0, 30), (1, 30), (2, 50)]


In [85]:
X = np.array(data[['x1', 'x2']])
y = np.array(data[['y']])

X_resampled, y_resampled = SMOTE(sampling_strategy='auto', random_state=90, k_neighbors=4).fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[(0, 48), (1, 48), (2, 48)]


In [22]:
data['y'] = data['y'].map({0:'g', 1:'h', 2:'t'})

In [26]:
X = np.array(data[['x1', 'x2']])
y = np.array(data[['y']])

X_resampled, y_resampled = SMOTE(sampling_strategy={'g':25, 'h': 30}, random_state=90, k_neighbors=4).fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[('g', 25), ('h', 30), ('t', 64)]


In [28]:
data_resampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1).reset_index(drop=True)
data_resampled.columns = data_columns

In [29]:
data_resampled

Unnamed: 0,x1,x2,y
0,-1.471079,-2.083655,2
1,-1.070979,0.756123,1
2,0.124644,1.217591,1
3,-0.238493,-0.482590,2
4,-1.288126,-0.695669,2
...,...,...,...
114,-0.439787,0.988788,1
115,0.032804,1.196339,1
116,-0.872103,0.715759,1
117,-1.530848,0.489401,1


In [184]:
# data_sp = ps.from_pandas(data).to_spark()

# data_sp.show()

# type(data_sp)

# feature_cols = data_sp.columns[:-1]
# label_col = data_sp.columns[-1]

# assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# data_sp = assembler.transform(data_sp)

# label_indexer = StringIndexer(inputCol=label_col, outputCol="label")
# data_sp = label_indexer.fit(data_sp).transform(data_sp)

# data_sp.show()

# pandas_data = data_sp.select("features", "label").toPandas()

# X_train = pandas_data["features"].values
# y_train = pandas_data["label"].values

# oversampler = SMOTE(sampling_strategy='auto', random_state=42)
# X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

# Convert the over-sampled arrays back to a DataFrame
# oversampled_data = spark.createDataFrame(zip(X_train_oversampled.tolist(), y_train_oversampled.tolist()), ["features", "label"])

#### UnderSampling

In [58]:
X, y = make_classification(n_samples=1000, n_features=4, n_informative=3,
                           n_redundant=0, n_repeated=0, n_classes=5,
                           n_clusters_per_class=1,
                           weights=[0.18, 0.12, 0.4, 0.08, 0.22],
                           class_sep=0.8, random_state=56)

In [59]:
print(sorted(Counter(y).items()))

[(0, 181), (1, 120), (2, 399), (3, 81), (4, 219)]


In [65]:
data = pd.DataFrame(X, columns=['x1', 'x2', 'x3', 'x4'])
data = pd.concat([data, pd.Series(y, name='y')], axis=1)
data_columns = data.columns
data

Unnamed: 0,x1,x2,x3,x4,y
0,1.193535,1.187104,-1.111734,0.527478,0
1,-0.393377,-0.075908,-0.460103,0.654180,4
2,0.261888,-1.195204,2.217908,-0.677320,2
3,0.748441,1.012602,-1.011012,-0.565263,0
4,-0.466457,0.436402,3.613897,2.937681,2
...,...,...,...,...,...
995,-0.338276,-0.002876,-0.732737,-0.143404,4
996,-0.356604,0.417834,-1.898244,2.551995,2
997,-0.777908,-1.098488,-0.737883,1.456472,4
998,-0.830709,-1.363576,3.088868,0.632730,2


In [64]:
from imblearn.under_sampling import RandomUnderSampler
X_resampled, y_resampled = RandomUnderSampler(sampling_strategy='auto', random_state=0).fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[(0, 81), (1, 81), (2, 81), (3, 81), (4, 81)]


In [66]:
data_resampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1).reset_index(drop=True)
data_resampled.columns = data_columns

In [67]:
data_resampled

Unnamed: 0,x1,x2,x3,x4,y
0,-0.741482,2.054190,-0.986116,1.166903,0
1,0.700590,0.221004,-1.622952,0.095383,0
2,0.081790,0.908561,-1.050450,-0.249155,0
3,0.442723,0.886671,-0.178275,-1.663296,0
4,-0.859336,0.057242,-0.942352,-0.152584,0
...,...,...,...,...,...
400,1.481106,-1.212811,-1.143307,0.537810,4
401,1.626526,-0.530095,-0.576041,0.947944,4
402,-0.540913,0.106516,-0.637565,-0.014161,4
403,0.675251,0.033790,-0.274642,0.883192,4


In [78]:
from imblearn.under_sampling import RandomUnderSampler
X_resampled, y_resampled = RandomUnderSampler(sampling_strategy={0:150, 2:40, 4:22}, random_state=0).fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[(0, 150), (1, 120), (2, 40), (3, 81), (4, 22)]


#### Chi-Square test

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Create a Spark session
import findspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession \
    .builder \
    .appName("test") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .master("local[*]") \
    .getOrCreate()

In [2]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, lit, format_number



In [82]:
data_pandas = pd.DataFrame({
    'A1': [1, 2, 3.5, 6, 7],
    'B2': [3, 5.6, 0, 0, 0.9],
    'FF': [33, 15.6, 20, 40, 90.9],
    'label': [1, 2, 1, 2, 1]})
dataset = ps.from_pandas(data_pandas).to_spark()
dataset.show()

+---+---+----+-----+
| A1| B2|  FF|label|
+---+---+----+-----+
|1.0|3.0|33.0|    1|
|2.0|5.6|15.6|    2|
|3.5|0.0|20.0|    1|
|6.0|0.0|40.0|    2|
|7.0|0.9|90.9|    1|
+---+---+----+-----+



In [83]:
from pyspark.sql.functions import bround, when

In [84]:
inputCols = ["A1", "B2", "FF"]
vector_assembler = VectorAssembler(inputCols=inputCols, outputCol="features")
dataset = vector_assembler.transform(dataset)
dataset.show()

+---+---+----+-----+--------------+
| A1| B2|  FF|label|      features|
+---+---+----+-----+--------------+
|1.0|3.0|33.0|    1|[1.0,3.0,33.0]|
|2.0|5.6|15.6|    2|[2.0,5.6,15.6]|
|3.5|0.0|20.0|    1|[3.5,0.0,20.0]|
|6.0|0.0|40.0|    2|[6.0,0.0,40.0]|
|7.0|0.9|90.9|    1|[7.0,0.9,90.9]|
+---+---+----+-----+--------------+



In [85]:
res = ChiSquareTest.test(dataset=dataset, featuresCol='features', labelCol='label', flatten=True)

In [86]:
expr = when(col("featureIndex") == 0, inputCols[0])
for index, inputCol in enumerate(inputCols[1:], start=1):
    expr = expr.when(col("featureIndex") == index, inputCol)
expr = expr.otherwise(None)
res = res.withColumn('features', expr)

In [87]:
res = res.select('features', bround('pValue', scale=3).alias('pValue'), 'degreesOfFreedom', bround('statistic', scale=3).alias('statistic'))
res.show()

+--------+------+----------------+---------+
|features|pValue|degreesOfFreedom|statistic|
+--------+------+----------------+---------+
|      A1| 0.287|               4|      5.0|
|      B2| 0.405|               3|    2.917|
|      FF| 0.287|               4|      5.0|
+--------+------+----------------+---------+

