In [None]:
!pip install pyspark
#Pyspark
from pyspark.sql import SQLContext, functions as f, SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier
from pyspark.ml.linalg import Vectors, VectorUDT,_convert_to_vector
from pyspark.mllib.evaluation import MulticlassMetrics
#Other module
import numpy as np
import statistics
import pandas as pd
import scipy.sparse
from sklearn.metrics import classification_report
#Spark Session
def SparkConfig():
  """Build (or reuse) the SparkSession named "Elephas_APP".

  Returns:
    The session returned by ``getOrCreate()`` after the options below
    have been applied to the builder.
  """
  # Options are applied in insertion order, one ``config`` call per entry.
  options = {
    "spark.yarn.maxAppAttempts": "2",
    "spark.num.executors": "100",
    "spark.executor.memory": "12g",
    "spark.driver.memory": "12g",
    "spark.cores.max": "20",
    "spark.executor.cores": "12",
    "spark.memory.offHeap.enabled": True,
    "spark.memory.offHeap.size": "12g",
  }
  builder = SparkSession.builder.appName("Elephas_APP")
  for key, value in options.items():
    builder = builder.config(key, value)
  session = builder.getOrCreate()
  # Runs a SET statement for this key without a value; the result is discarded.
  # NOTE(review): presumably meant to assign a policy value - confirm.
  session.sql("set spark.sql.legacy.timeParserPolicy")
  return session
spark = SparkConfig()

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 63 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 62.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=e90e890d3ed7a01b3c51b769378bd2aa18c52b8d7762d6b5ec2a7d16f7006389
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load dataset

In [None]:
#Load data
# Read the CSV with a header row, "," as separator, and schema inference.
# NOTE(review): the path is relative, so this cell depends on the current
# working directory - confirm it is set before this cell runs.
df = spark.read.format("csv").options(inferSchema = True,header = True,sep = ",")\
.load("./Big Data/Data Kalapa/Dataset/Cleaned_Dataset3.csv")
# Names of every column whose inferred type is string; these are dropped.
string = [c for c,t in df.dtypes if t == "string"]
df = df.drop(*string)
# Show the first 5 rows without truncating cell contents.
df.show(5,False)

+-----+-------+--------+--------+--------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+--------+--------+--------+-------------------+--------+--------+--------+--------+--------+-----------+---------------+-------------------------+-----------------------+------------------------+------------------+----------------+-----------------+----------+-------+--------+----------+--------+----------+--------+----------+--------+----------+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+------------+--------+------------------+-----------------+--------

# Create Pipeline to transform data

In [None]:
# Select the columns whose kurtosis lies outside [lower_skew, upper_skew]
def Select_Feature_To_Scale(df, lower_skew = -2, upper_skew = 2,  types = ['int','double']):
  """Return the names of columns whose kurtosis is outside the given bounds.

  Despite the parameter names, only kurtosis (pandas ``Series.kurtosis``)
  is evaluated; skewness is never computed.

  Args:
    df: object exposing ``dtypes`` as (name, type) pairs and ``toPandas()``.
    lower_skew: a column is selected when its kurtosis is below this value.
    upper_skew: a column is selected when its kurtosis is above this value.
    types: dtype names to consider; candidates are grouped in this order.

  Returns:
    List of selected column names, ordered by ``types`` and then by the
    column order of ``df``.
  """
  # Candidate columns, grouped by requested dtype
  feature_list = [c for typ in types for c, t in df.dtypes if t == typ]
  if not feature_list:
    # Nothing to inspect, so skip pulling the data to the driver.
    return []

  # Local name deliberately differs from the pandas alias `pd`, which the
  # original shadowed inside this function.
  local_frame = df.toPandas()

  selected_features = []
  for feature in feature_list:
    # Compute the statistic once per column instead of once per comparison.
    kurt = local_frame[feature].kurtosis()
    if kurt < lower_skew or kurt > upper_skew:
      selected_features.append(feature)
  return selected_features

# Create pipeline to transform
def CreatedPipeline(df,add_cols = ['birth_month',"Field_82"],label = "label"):
  """Build the unfitted preprocessing Pipeline for `df`.

  Stages, in order: a StringIndexer + OneHotEncoder pair per categorical
  column, a VectorAssembler + StandardScaler over the columns returned by
  `Select_Feature_To_Scale`, a StringIndexer for the label, a
  VectorAssembler over the one-hot vectors plus the remaining numeric
  columns, and a final VectorAssembler that writes the "features" column.

  Args:
    df: Spark DataFrame used to discover column names and types.
    add_cols: column names treated as categorical in addition to every
      column whose name contains "is_".
    label: target column; indexed into ``label + "_indexed"``.

  Returns:
    An unfitted ``Pipeline``.
  """
  bool_features = [c for c in df.columns if "is_" in c]
  cat_features = add_cols + bool_features
  cat_lookup = set(cat_features)
  # Keep DataFrame column order (rather than iterating a set) so that the
  # positions inside the assembled vector are the same on every run.
  num_features = [c for c,t in df.dtypes
                  if t != "string" and c not in cat_lookup and c != label]

  # Pipeline Stages List
  stages = []

  # StringIndexer and OHE for each categorical variable
  for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_indexed")
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()],
                            outputCols=[feature + "_class_vec"])
    stages += [string_indexer, encoder]

  # Select_Feature_To_Scale inspects every int/double column of df, which
  # includes the label; drop it so the target never enters the features.
  # NOTE(review): categorical columns with a numeric dtype can still be
  # selected here and so appear both one-hot encoded and scaled - confirm.
  unscaled_features = [c for c in Select_Feature_To_Scale(df) if c != label]
  unscaled_assembler = VectorAssembler(inputCols=unscaled_features,outputCol="unscaled_features")
  scaler = StandardScaler(inputCol="unscaled_features",outputCol="scaled_features")
  stages += [unscaled_assembler, scaler]

  # Numeric features that are not being scaled (column order preserved)
  scaled_lookup = set(unscaled_features)
  num_unscaled_diff_list = [c for c in num_features if c not in scaled_lookup]

  # Label indexer
  label_indexer = StringIndexer(inputCol= label,outputCol=label + "_indexed")

  # Concat one-hot vectors and the remaining numeric features
  assembler_inputs = [feature + "_class_vec" for feature in cat_features] + num_unscaled_diff_list
  assembler = VectorAssembler(inputCols=assembler_inputs,outputCol = "assembled_inputs")
  stages += [label_indexer, assembler]

  # Final assembler: scaled block followed by the assembled block
  assembler_final = VectorAssembler(inputCols=["scaled_features","assembled_inputs"],outputCol="features")
  stages.append(assembler_final)

  return Pipeline(stages=stages)

# Model training

In [None]:
def Gini(y_true, y_pred):
    """Normalized Gini: the Gini of `y_true` ranked by `y_pred`, divided by
    the Gini of `y_true` ranked by itself.

    Both arguments must be arrays of identical shape (asserted).
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Row 0 holds the true values, row 1 the predictions (common dtype).
    stacked = np.array([y_true, y_pred])
    actual = stacked[0]

    # True values ordered from largest to smallest key.
    ranked_by_truth = actual[actual.argsort()][::-1]
    ranked_by_pred = actual[stacked[1].argsort()][::-1]

    # Reference line the Lorenz curves are compared against.
    diagonal = np.linspace(1 / n_samples, 1, n_samples)

    def _area(ordered):
        # Sum of the gap between the reference line and the Lorenz curve.
        lorenz = np.cumsum(ordered) * 1. / np.sum(ordered)
        return np.sum(diagonal - lorenz)

    area_truth = _area(ranked_by_truth)
    area_pred = _area(ranked_by_pred)

    # Normalize by the Gini obtained with a perfect ranking.
    return area_pred * 1. / area_truth

In [None]:
def Evaluate(model,model_name,data_name,trainingSet,devSet,boolean,label = "label_indexed"):
  """Score `model` on `devSet`, display the metrics and append them to the results CSV.

  Args:
    model: fitted model exposing ``transform``; its output must contain a
      "prediction" column and the `label` column.
    model_name: value written to the "model" column of the result row.
    data_name: value written to the "dataset" column of the result row.
    trainingSet: unused; kept so existing callers keep working.
    devSet: DataFrame to evaluate on.
    boolean: passed to ``to_csv`` as ``header`` (write the header row or not).
    label: name of the gold-label column.
  """
  dev_pred = model.transform(devSet).select(label,"prediction")

  # Collect both columns in ONE action. The original ran two separate
  # collects, which re-executes the plan twice and relies on both runs
  # returning rows in the same order to keep labels and predictions paired.
  rows = dev_pred.collect()
  dev_gol_array = np.array([row[0] for row in rows])
  dev_sys_array = np.array([row[1] for row in rows])

  dev_pred_rdd = dev_pred.rdd.map(lambda row: (row["prediction"],row[label]))
  metrics_dev = MulticlassMetrics(dev_pred_rdd)

  # Average over the two classes; a class that is never predicted adds 0.
  predicted_classes = [c for c in [0.0,1.0] if c in dev_sys_array]
  preci = np.sum([metrics_dev.precision(c) for c in predicted_classes])/2
  recal = np.sum([metrics_dev.recall(c) for c in predicted_classes])/2
  f1 = np.sum([metrics_dev.fMeasure(c) for c in predicted_classes])/2
  acc = metrics_dev.accuracy
  # Gini is computed on the "prediction" column values.
  gini = Gini(dev_gol_array,dev_sys_array)

  print(classification_report(dev_gol_array,dev_sys_array,labels = [0.0,1.0]))
  result = pd.DataFrame({"dataset": [data_name],
                   "model": [model_name],
                   "accuracy": [acc],
                   "precision": [preci],
                   "recal": [recal],
                   "f1_score": [f1],
                   "gini_core": [gini]})
  display(result)
  # Append one row per call to the shared results file.
  result.to_csv("./Big Data/Data Kalapa/SourceCode/KichBanB1_Result.csv",header = boolean,index = 0,mode = "a")
def Training(estimator,model_name,data_name,trainingSet,devSet,boolean = False):
  """Fit `estimator` on `trainingSet`, evaluate the result on `devSet`, return the fitted model.

  `boolean` is handed to `Evaluate` unchanged.
  """
  print(f"==================={model_name}===================")
  fitted = estimator.fit(trainingSet)
  Evaluate(fitted, model_name, data_name, trainingSet, devSet, boolean)
  return fitted

/content/drive/.shortcut-targets-by-id/1jpmnPqYOybAHB2aghnIs2YA5gydXzH5o/Big Data/Data Kalapa/SourceCode


In [None]:
# Estimators reused by every experiment; each reads its target from "label_indexed".
DTC = DecisionTreeClassifier(labelCol="label_indexed")
RFC = RandomForestClassifier(labelCol="label_indexed")
# The gradient-boosted model is configured with maxIter = 5.
GBTC = GBTClassifier(maxIter = 5,labelCol="label_indexed")

# Experiment with 3 Datasets

In [None]:
def FeatureInfo(f):
  """Print, for w = 0..5, the indices of `f` whose value exceeds w / len(f.indices).

  `f` must expose ``indices`` and support integer indexing.
  """
  for level in range(6):
    kept = []
    for idx in f.indices:
      # The threshold is evaluated per element, so an empty `indices` never divides by zero.
      if f[int(idx)] > level/len(f.indices):
        kept.append(idx)
    print("Num of filtered{} features: {}".format(level,len(kept)))
    print(kept)

## Original Dataset (**DATASET 1**)

In [None]:
# Dataset 1: the first 91 columns of df.
df1 = df.select(df.columns[:91])
print("NumOfColumn: {}".format(len(df1.columns)))

# Fit the preprocessing pipeline with "Field_82" as the only extra categorical column.
pip1_transform = CreatedPipeline(df1,add_cols = ["Field_82"])
trans_df1_ = pip1_transform.fit(df1)
# NOTE(review): saved without overwrite() - presumably fails if the path
# already exists on a re-run; confirm.
trans_df1_.write().save("./Big Data/Data Kalapa/SourceCode/Pipeline1B")
trans_df1 = trans_df1_.transform(df1) 
# 90/10 train/dev split with a fixed seed.
trainingSet1, devSet1 = trans_df1.randomSplit([.9,.1],seed = 2021)
# Row count per label value in the training split.
trainingSet1.groupby("label").count().show()
#display(trainingSet1.select("features").first())
#input_dim = len(trainingSet1.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 91


In [None]:
# Train the three tree-based estimators on Dataset 1 and print the indices
# that pass each importance threshold.
# boolean = True on the first call reaches to_csv as header=True inside Evaluate.
model11 = Training(DTC,"Decision Tree","Dataset1",trainingSet1,devSet1,boolean = True)
f11 = model11.featureImportances
FeatureInfo(f11)

model21 = Training(RFC,"Random Forest","Dataset1",trainingSet1,devSet1)
f21 = model21.featureImportances
FeatureInfo(f21)

model31 = Training(GBTC,"Gradient Boosted Trees","Dataset1",trainingSet1,devSet1)
f31 = model31.featureImportances
FeatureInfo(f31)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1,Decision Tree,0.72536,0.592991,0.70283,0.5891,0.143641


Num of filtered0 features: 16
[0, 20, 28, 30, 31, 34, 36, 43, 46, 51, 52, 61, 65, 66, 71, 89]
Num of filtered1 features: 3
[30, 31, 36]
Num of filtered2 features: 3
[30, 31, 36]
Num of filtered3 features: 2
[30, 31]
Num of filtered4 features: 1
[30]
Num of filtered5 features: 1
[30]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1,Random Forest,0.720192,0.579132,0.702117,0.567682,0.127861


Num of filtered0 features: 79
[0, 1, 2, 4, 5, 7, 9, 10, 11, 12, 15, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 92]
Num of filtered1 features: 14
[2, 23, 24, 25, 28, 29, 30, 31, 36, 41, 52, 73, 77, 89]
Num of filtered2 features: 5
[28, 29, 30, 31, 77]
Num of filtered3 features: 5
[28, 29, 30, 31, 77]
Num of filtered4 features: 4
[28, 29, 30, 77]
Num of filtered5 features: 4
[28, 29, 30, 77]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1,Gradient Boosted Trees,0.729236,0.600781,0.70755,0.60008,0.149825


Num of filtered0 features: 47
[0, 1, 3, 9, 19, 20, 22, 23, 28, 30, 31, 33, 34, 36, 40, 41, 43, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 78, 81, 82, 83, 86, 89]
Num of filtered1 features: 16
[0, 19, 20, 23, 28, 30, 31, 33, 34, 36, 40, 46, 66, 73, 82, 89]
Num of filtered2 features: 8
[19, 20, 28, 30, 31, 36, 40, 66]
Num of filtered3 features: 4
[20, 28, 30, 31]
Num of filtered4 features: 1
[30]
Num of filtered5 features: 1
[30]


## Add the additional columns separated from the timestamp (**DATASET 2**)

In [None]:
# Dataset 2: the first 158 columns of df.
df2 = df.select(df.columns[:158])
print("NumOfColumn: {}".format(len(df2.columns)))

# Fit the preprocessing pipeline with the default extra categorical columns.
pip2_transform = CreatedPipeline(df2)
trans_df2_ = pip2_transform.fit(df2)
# NOTE(review): saved without overwrite() - presumably fails if the path
# already exists on a re-run; confirm.
trans_df2_.write().save("./Big Data/Data Kalapa/SourceCode/Pipeline2B")
trans_df2 = trans_df2_.transform(df2)
# 90/10 train/dev split with a fixed seed.
trainingSet2, devSet2 = trans_df2.randomSplit([.9,.1],seed = 2021)
# Row count per label value in the training split.
trainingSet2.groupby("label").count().show()
#display(trainingSet2.select("features").first())
#input_dim = len(trainingSet2.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 158
+-----+-----+
|label|count|
+-----+-----+
|    1|15099|
|    0|32513|
+-----+-----+



In [None]:
# Train the three tree-based estimators on Dataset 2 and print the indices
# that pass each importance threshold.
model12 = Training(DTC,"Decision Tree","Dataset2",trainingSet2,devSet2)
f12 = model12.featureImportances
FeatureInfo(f12)

model22 = Training(RFC,"Random Forest","Dataset2",trainingSet2,devSet2)
f22 = model22.featureImportances
FeatureInfo(f22)

model32 = Training(GBTC,"Gradient Boosted Trees","Dataset2",trainingSet2,devSet2)
# f32 is reused by the later feature-selection cells.
f32 = model32.featureImportances
FeatureInfo(f32)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2,Decision Tree,0.724437,0.59873,0.692053,0.598428,0.112045


Num of filtered0 features: 15
[7, 16, 35, 44, 58, 59, 79, 90, 92, 102, 122, 130, 139, 159, 160]
Num of filtered1 features: 3
[58, 59, 102]
Num of filtered2 features: 2
[58, 59]
Num of filtered3 features: 2
[58, 59]
Num of filtered4 features: 1
[58]
Num of filtered5 features: 1
[58]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2,Random Forest,0.715762,0.568535,0.699652,0.550372,0.11757


Num of filtered0 features: 105
[0, 1, 3, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 19, 21, 23, 25, 26, 28, 31, 33, 35, 36, 37, 38, 39, 41, 42, 44, 53, 57, 58, 59, 60, 66, 69, 70, 76, 79, 80, 81, 82, 83, 85, 86, 87, 89, 90, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 105, 107, 108, 109, 110, 112, 114, 115, 117, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 135, 137, 139, 140, 143, 147, 148, 150, 151, 152, 153, 154, 155, 157, 158, 159, 160, 161, 162, 164, 165, 167, 168, 169, 170, 171]
Num of filtered1 features: 18
[7, 13, 23, 26, 41, 44, 57, 58, 59, 60, 90, 92, 122, 123, 127, 139, 159, 167]
Num of filtered2 features: 9
[7, 13, 44, 57, 58, 59, 92, 123, 167]
Num of filtered3 features: 4
[44, 57, 58, 59]
Num of filtered4 features: 4
[44, 57, 58, 59]
Num of filtered5 features: 3
[44, 57, 58]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2,Gradient Boosted Trees,0.726283,0.600394,0.696764,0.600394,0.11884


Num of filtered0 features: 62
[0, 2, 7, 13, 15, 16, 17, 25, 35, 36, 38, 39, 44, 57, 58, 59, 79, 81, 82, 83, 85, 86, 90, 91, 92, 94, 98, 101, 102, 103, 107, 108, 109, 110, 112, 113, 117, 119, 122, 125, 128, 129, 130, 132, 133, 137, 139, 140, 142, 143, 144, 147, 148, 149, 150, 154, 157, 159, 160, 161, 168, 170]
Num of filtered1 features: 20
[16, 35, 36, 38, 39, 44, 58, 59, 79, 83, 94, 102, 122, 125, 130, 139, 140, 159, 160, 161]
Num of filtered2 features: 10
[16, 35, 36, 39, 44, 58, 59, 79, 102, 122]
Num of filtered3 features: 5
[36, 44, 58, 59, 102]
Num of filtered4 features: 2
[58, 59]
Num of filtered5 features: 1
[58]


## Add all additional columns (**DATASET 3**)

In [None]:
# Dataset 3: every column of df (df3 is the same object as df).
df3 = df
print("NumOfColumn: {}".format(len(df.columns)))
# Fit the preprocessing pipeline with the default extra categorical columns.
pip3_transform = CreatedPipeline(df3)
trans_df3_ = pip3_transform.fit(df3)
# NOTE(review): saved without overwrite() - presumably fails if the path
# already exists on a re-run; confirm.
trans_df3_.write().save("./Big Data/Data Kalapa/SourceCode/Pipeline3B")
trans_df3 = trans_df3_.transform(df3)
# 90/10 train/dev split with a fixed seed.
trainingSet3, devSet3 = trans_df3.randomSplit([.9,.1],seed = 2021)

#display(trainingSet3.select("features").first())
#input_dim = len(trainingSet3.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 168


In [None]:
# Train the three tree-based estimators on Dataset 3 and print the indices
# that pass each importance threshold.
model13 = Training(DTC,"Decision Tree","Dataset3",trainingSet3,devSet3)
f13 = model13.featureImportances
FeatureInfo(f13)

model23 = Training(RFC,"Random Forest","Dataset3",trainingSet3,devSet3)
f23 = model23.featureImportances
FeatureInfo(f23)

model33 = Training(GBTC,"Gradient Boosted Trees","Dataset3",trainingSet3,devSet3)
f33 = model33.featureImportances
FeatureInfo(f33)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3,Decision Tree,0.724437,0.59873,0.692053,0.598428,0.112045


Num of filtered0 features: 15
[7, 19, 38, 47, 61, 62, 85, 96, 99, 110, 132, 140, 150, 170, 171]
Num of filtered1 features: 3
[61, 62, 110]
Num of filtered2 features: 2
[61, 62]
Num of filtered3 features: 2
[61, 62]
Num of filtered4 features: 1
[61]
Num of filtered5 features: 1
[61]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3,Random Forest,0.714655,0.557472,0.72712,0.527841,0.130231


Num of filtered0 features: 106
[1, 2, 5, 6, 7, 10, 11, 12, 13, 14, 15, 17, 19, 21, 22, 24, 25, 26, 29, 31, 34, 35, 37, 38, 39, 41, 42, 45, 47, 48, 51, 54, 57, 60, 61, 62, 63, 73, 77, 82, 85, 88, 89, 91, 92, 93, 96, 98, 99, 100, 102, 104, 106, 107, 108, 109, 110, 113, 114, 115, 116, 117, 120, 122, 123, 125, 126, 130, 132, 133, 135, 136, 137, 138, 139, 140, 142, 143, 144, 146, 147, 148, 150, 151, 152, 154, 155, 158, 159, 162, 163, 164, 165, 166, 168, 170, 171, 172, 173, 176, 177, 178, 180, 181, 182, 183]
Num of filtered1 features: 21
[21, 24, 26, 29, 31, 35, 38, 47, 60, 61, 62, 93, 96, 98, 99, 102, 132, 140, 170, 171, 173]
Num of filtered2 features: 10
[21, 24, 29, 35, 38, 47, 60, 61, 62, 171]
Num of filtered3 features: 4
[47, 60, 61, 62]
Num of filtered4 features: 4
[47, 60, 61, 62]
Num of filtered5 features: 4
[47, 60, 61, 62]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3,Gradient Boosted Trees,0.726283,0.600394,0.696764,0.600394,0.11884


Num of filtered0 features: 62
[0, 2, 7, 13, 15, 19, 20, 28, 38, 39, 41, 42, 47, 60, 61, 62, 85, 87, 88, 89, 91, 92, 96, 97, 99, 102, 106, 109, 110, 111, 115, 116, 117, 118, 120, 121, 125, 128, 132, 135, 138, 139, 140, 142, 143, 148, 150, 151, 153, 154, 155, 158, 159, 160, 161, 165, 168, 170, 171, 172, 179, 181]
Num of filtered1 features: 20
[19, 38, 39, 41, 42, 47, 61, 62, 85, 89, 102, 110, 132, 135, 140, 150, 151, 170, 171, 172]
Num of filtered2 features: 10
[19, 38, 39, 42, 47, 61, 62, 85, 110, 132]
Num of filtered3 features: 5
[39, 47, 61, 62, 110]
Num of filtered4 features: 2
[61, 62]
Num of filtered5 features: 1
[61]


**Tiếp theo, thực hiện sử dụng thuộc tính rút trích để chạy model để thực hiện so sánh với TH gốc, các kịch bản vạch ra:**

*  Sử dụng bộ dataset cho kết quả tốt nhất (**DATASET2**)
*  Sử dụng tree-based tốt nhất để rút trích thuộc tính quan trọng (**GBT**)
  * Sử dụng toàn phần thuộc tính quan trọng
  * Sử dụng một phần thuộc tính quan trọng
* Chạy lại 3 model tree-based
* Thực hiện so sánh





========================================================================

# Models using important features for DATASET2

In [None]:
def dense_to_sparse(vector):
  """Rebuild `vector` through a scipy CSC matrix and `_convert_to_vector`."""
  as_csc = scipy.sparse.csc_matrix(vector.toArray())
  # The matrix is transposed before being handed to the converter.
  return _convert_to_vector(as_csc.T)

# Wrap the converter as a UDF that returns a vector column.
to_sparse = f.udf(dense_to_sparse, VectorUDT())

# Replace the "features" column of both Dataset 2 splits with the converted vectors.
trainingSet2 = trainingSet2.withColumn("features", to_sparse("features"))
devSet2 = devSet2.withColumn("features", to_sparse("features"))

**All important features**

In [None]:
def Convert1(x,f):
  """Return a dense vector holding `x`'s values at every index in `f.indices`.

  Positions absent from `x.indices` stay at 0.0.
  """
  picked = dict.fromkeys(f.indices, 0.0)
  for key in picked:
    if key in x.indices:
      picked[key] = x[int(key)]
  return Vectors.dense(list(picked.values()))

# UDF bound to the importances stored in f32.
convert1 = f.udf(lambda vec: Convert1(vec, f32), VectorUDT())
trainingSet12 = trainingSet2.withColumn("features", convert1("features"))
devSet12 = devSet2.withColumn("features", convert1("features"))

In [None]:
# Retrain the three estimators on the vectors restricted to every index in f32 ("Full").
model112 = Training(DTC,"Decision Tree","Full",trainingSet12,devSet12)
model122 = Training(RFC,"Random Forest","Full",trainingSet12,devSet12)
model132 = Training(GBTC,"Gradient Boosted Trees","Full",trainingSet12,devSet12)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Full,Decision Tree,0.724437,0.59873,0.692053,0.598428,0.112045




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Full,Random Forest,0.723699,0.586333,0.706279,0.578634,0.130804




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Full,Gradient Boosted Trees,0.726283,0.600394,0.696764,0.600394,0.11884


**A part of impo features (filtered1)**

In [None]:
def Convert2(x,f):
  """Return a dense vector of `x`'s values at the indices of `f` whose value exceeds 1/len(f.indices).

  Positions absent from `x.indices` stay at 0.0.
  """
  picked = dict.fromkeys(
    (i for i in f.indices if f[int(i)] > 1/len(f.indices)), 0.0)
  for key in picked:
    if key in x.indices:
      picked[key] = x[int(key)]
  return Vectors.dense(list(picked.values()))

# UDF bound to the importances stored in f32.
convert2 = f.udf(lambda vec: Convert2(vec, f32), VectorUDT())
trainingSet22 = trainingSet2.withColumn("features", convert2("features"))
devSet22 = devSet2.withColumn("features", convert2("features"))

In [None]:
# These splits were built with Convert2 (threshold 1/len(f.indices)), i.e. the
# "filtered1" subset. Log them as "Filtered1" so their rows in the results CSV
# do not collide with the experiment that is logged as "Filtered2".
model212 = Training(DTC,"Decision Tree","Filtered1",trainingSet22,devSet22)
model222 = Training(RFC,"Random Forest","Filtered1",trainingSet22,devSet22)
model232 = Training(GBTC,"Gradient Boosted Trees","Filtered1",trainingSet22,devSet22)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Decision Tree,0.724437,0.597608,0.693255,0.596722,0.112481




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Random Forest,0.726283,0.592381,0.707888,0.587718,0.154329




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Gradient Boosted Trees,0.72739,0.601521,0.699475,0.60177,0.12702


**A part of impo features (filtered2)**

In [None]:
def Convert3(x,f):
  """Return a dense vector of `x`'s values at the indices of `f` whose value exceeds 2/len(f.indices).

  Positions absent from `x.indices` stay at 0.0.
  """
  picked = dict.fromkeys(
    (i for i in f.indices if f[int(i)] > 2/len(f.indices)), 0.0)
  for key in picked:
    if key in x.indices:
      picked[key] = x[int(key)]
  return Vectors.dense(list(picked.values()))

# UDF bound to the importances stored in f32.
convert3 = f.udf(lambda vec: Convert3(vec, f32), VectorUDT())
trainingSet32 = trainingSet2.withColumn("features", convert3("features"))
devSet32 = devSet2.withColumn("features", convert3("features"))

In [None]:
# Retrain the three estimators on the splits produced by convert3 (threshold 2/len(f.indices)).
model312 = Training(DTC,"Decision Tree","Filtered2",trainingSet32,devSet32)
model322 = Training(RFC,"Random Forest","Filtered2",trainingSet32,devSet32)
model332 = Training(GBTC,"Gradient Boosted Trees","Filtered2",trainingSet32,devSet32)

              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.64      0.25      0.36      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.59      0.59      5418
weighted avg       0.70      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Decision Tree,0.721299,0.594202,0.685799,0.592502,0.139543


              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.64      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.60      0.60      5418
weighted avg       0.71      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Random Forest,0.722776,0.5972,0.687908,0.596614,0.117351


              precision    recall  f1-score   support

         0.0       0.73      0.94      0.82      3721
         1.0       0.65      0.25      0.36      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.59      0.59      5418
weighted avg       0.71      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2,Gradient Boosted Trees,0.723145,0.594424,0.692131,0.592219,0.117681


**A part of impo features (filtered3)**

In [None]:
def Convert4(x,f):
  """Return a dense vector of `x`'s values at the indices of `f` whose value exceeds 3/len(f.indices).

  Positions absent from `x.indices` stay at 0.0.
  """
  picked = dict.fromkeys(
    (i for i in f.indices if f[int(i)] > 3/len(f.indices)), 0.0)
  for key in picked:
    if key in x.indices:
      picked[key] = x[int(key)]
  return Vectors.dense(list(picked.values()))

# UDF bound to the importances stored in f32.
convert4 = f.udf(lambda vec: Convert4(vec, f32), VectorUDT())
trainingSet42 = trainingSet2.withColumn("features", convert4("features"))
devSet42 = devSet2.withColumn("features", convert4("features"))

In [None]:
# Retrain the three estimators on the splits produced by convert4 (threshold 3/len(f.indices)).
model412 = Training(DTC,"Decision Tree","Filtered3",trainingSet42,devSet42)
model422 = Training(RFC,"Random Forest","Filtered3",trainingSet42,devSet42)
model432 = Training(GBTC,"Gradient Boosted Trees","Filtered3",trainingSet42,devSet42)

              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.63      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.68      0.60      0.60      5418
weighted avg       0.70      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered3,Decision Tree,0.721299,0.596606,0.683594,0.59616,0.10414


              precision    recall  f1-score   support

         0.0       0.74      0.94      0.82      3721
         1.0       0.65      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.60      0.60      5418
weighted avg       0.71      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered3,Random Forest,0.724068,0.59798,0.691551,0.597404,0.114413


              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.64      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.60      0.60      5418
weighted avg       0.70      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered3,Gradient Boosted Trees,0.722038,0.596342,0.686249,0.59554,0.138979


**A part of impo features (filtered4)**

In [None]:
def Convert5(x,f):
  """Return a dense vector of `x`'s values at the indices of `f` whose value exceeds 4/len(f.indices).

  Args:
    x: vector exposing ``indices`` and integer indexing.
    f: importance vector exposing ``indices`` and integer indexing.

  Returns:
    Dense vector with one entry per retained index, in the order of
    `f.indices`; positions absent from `x.indices` stay at 0.0.
  """
  idx_impo = {i: float(0.0) for i in f.indices if f[int(i)] > 4/len(f.indices)}
  for i in idx_impo.keys():
    if i in x.indices:
      idx_impo[i] = x[int(i)]
  values = list(idx_impo.values())
  return Vectors.dense(values)
# Fix: the UDF previously called Convert4, so this experiment silently reused
# the 3/len threshold instead of the 4/len threshold defined just above.
convert5 = f.udf(lambda x: Convert5(x,f32),VectorUDT())
trainingSet52 = trainingSet2.withColumn("features",convert5("features"))
devSet52 = devSet2.withColumn("features",convert5("features"))

In [None]:
# Retrain the three estimators on the splits produced by convert5, logged as "Filtered4".
model512 = Training(DTC,"Decision Tree","Filtered4",trainingSet52,devSet52)
model522 = Training(RFC,"Random Forest","Filtered4",trainingSet52,devSet52)
model532 = Training(GBTC,"Gradient Boosted Trees","Filtered4",trainingSet52,devSet52)

              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.63      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.68      0.60      0.60      5418
weighted avg       0.70      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4,Decision Tree,0.721299,0.596606,0.683594,0.59616,0.10414


              precision    recall  f1-score   support

         0.0       0.74      0.94      0.82      3721
         1.0       0.65      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.60      0.60      5418
weighted avg       0.71      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4,Random Forest,0.724068,0.59798,0.691551,0.597404,0.114413


              precision    recall  f1-score   support

         0.0       0.73      0.93      0.82      3721
         1.0       0.64      0.26      0.37      1697

    accuracy                           0.72      5418
   macro avg       0.69      0.60      0.60      5418
weighted avg       0.70      0.72      0.68      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4,Gradient Boosted Trees,0.722038,0.596342,0.686249,0.59554,0.138979


**A part of impo features (filtered5)**

In [None]:
def Convert6(x,f):
  """Return a dense vector of `x`'s values at the indices of `f` whose value exceeds 5/len(f.indices).

  Positions absent from `x.indices` stay at 0.0.
  """
  picked = dict.fromkeys(
    (i for i in f.indices if f[int(i)] > 5/len(f.indices)), 0.0)
  for key in picked:
    if key in x.indices:
      picked[key] = x[int(key)]
  return Vectors.dense(list(picked.values()))

# UDF bound to the importances stored in f32.
convert6 = f.udf(lambda vec: Convert6(vec, f32), VectorUDT())
trainingSet62 = trainingSet2.withColumn("features", convert6("features"))
devSet62 = devSet2.withColumn("features", convert6("features"))

In [None]:
# Retrain the three estimators on the splits produced by convert6 (threshold 5/len(f.indices)).
model612 = Training(DTC,"Decision Tree","Filtered5",trainingSet62,devSet62)
model622 = Training(RFC,"Random Forest","Filtered5",trainingSet62,devSet62)
model632 = Training(GBTC,"Gradient Boosted Trees","Filtered5",trainingSet62,devSet62)

              precision    recall  f1-score   support

         0.0       0.72      0.91      0.80      3721
         1.0       0.53      0.22      0.32      1697

    accuracy                           0.69      5418
   macro avg       0.62      0.57      0.56      5418
weighted avg       0.66      0.69      0.65      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered5,Decision Tree,0.694168,0.566436,0.623731,0.559071,0.038526


              precision    recall  f1-score   support

         0.0       0.72      0.91      0.80      3721
         1.0       0.53      0.22      0.32      1697

    accuracy                           0.69      5418
   macro avg       0.62      0.57      0.56      5418
weighted avg       0.66      0.69      0.65      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered5,Random Forest,0.694168,0.566436,0.623731,0.559071,0.038526


              precision    recall  f1-score   support

         0.0       0.72      0.91      0.80      3721
         1.0       0.53      0.22      0.32      1697

    accuracy                           0.69      5418
   macro avg       0.62      0.57      0.56      5418
weighted avg       0.66      0.69      0.65      5418



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered5,Gradient Boosted Trees,0.694168,0.566436,0.623731,0.559071,0.038526


========================================================================


In [None]:
# Persist model32 (the gradient-boosted model trained on Dataset 2).
# NOTE(review): saved without overwrite() - presumably fails if the path
# already exists on a re-run; confirm.
model32.write().save("./Big Data/Data Kalapa/SourceCode/ModelB")

# Extract feature importances
* **Khái niệm về sparse vector** <br>
Sparse vector được sử dụng khi phần lớn các giá trị (weight) của một vector bằng 0<br>
Ví dụ: Ta có một vector 500 chiều, với chiều ở index 0 có giá trị 1.0, index 335 có giá trị 2.0, index = 499 có giá trị 3.0, **tất cả index còn lại có giá trị 0**<br>
=> ***Sparse Vector***: 
 * Dạng từ điển: (500, {0: 1.0, 335: 2.0, 499: 3.0}) - **index: value**
 * Dạng kiểu list: (500, [0, 335, 499] - **Danh sách index**, [1.0, 2.0, 3.0] - **Danh sách values/weights**)
* **Xử lý với các đặc trưng quan trọng có dạng sparse vector**
 * Đầu vào: (500, [28, 33, 449], [1.0, 2.0, 3.0])
 * Đặc trưng quan trọng: (500, [1,2,28,29,55],[0.01,0.02,0.37,0.4,0.2])
<b>Bước 1</b>: Tạo từ điển với các index trong "Đặc trưng quan trọng"<br>
Từ điển: {1: 0.0, 2: 0.0, 28: 0.0,29: 0.0,55: 0.0}<br>
<b>Bước 2</b>: Điền vào từ điển với các ***giá trị trong đầu vào*** nếu ***index của đầu vào có trong từ điển***<br>
Từ điển: {1: 0.0, 2: 0.0, **28: 1.0**, 29: 0.0, 55: 0.0}<br>
<b>Bước 3</b>: Tạo một Dense Vector từ các giá trị trong từ điển<br>
[0.0, 0.0, **1.0**, 0.0, 0.0]<br>

==> Kết quả sau khi xử lý: [0.0, 0.0, **1.0**, 0.0, 0.0]


