In [None]:
!pip install pyspark
#Pyspark
from pyspark.sql import SQLContext, functions as f, SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler, PCA
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.mllib.evaluation import MulticlassMetrics
#Other module
import numpy as np
import statistics
import pandas as pd
#Spark Session
def SparkConfig():
  """Build (or reuse) the SparkSession used by this notebook and return it.

  The session is named "Elephas_APP" and is configured from the key/value
  pairs listed below before `getOrCreate()` is called.
  """
  settings = [
    ("spark.yarn.maxAppAttempts", "2"),
    # NOTE(review): "spark.num.executors" does not look like a standard Spark
    # key (the documented one is "spark.executor.instances") - confirm.
    ("spark.num.executors", "100"),
    ("spark.executor.memory", "12g"),
    ("spark.driver.memory", "12g"),
    ("spark.cores.max", "20"),
    ("spark.executor.cores", "12"),
    ("spark.memory.offHeap.enabled", True),
    ("spark.memory.offHeap.size", "12g"),
  ]
  builder = SparkSession.builder.appName("Elephas_APP")
  for key, value in settings:
    builder = builder.config(key, value)
  spark = builder.getOrCreate()
  # NOTE(review): this statement carries no "=value" part, so it presumably only
  # queries the current time-parser policy instead of changing it - confirm intent.
  spark.sql("set spark.sql.legacy.timeParserPolicy")
  return spark
spark = SparkConfig()



In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load dataset

In [None]:
# Read the cleaned CSV (comma separated, header row, inferred column types)
# into the Spark DataFrame `df`, then preview five rows without truncation.
df = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .option("sep", ",")
    .load("./Big Data/Data Kalapa/Dataset/Cleaned_Dataset3.csv")
)
df.show(5, truncate=False)

+-----+-------+-------+--------+--------+--------+-----------------------------+--------+---------+--------+--------+--------+--------+---------+--------+--------+--------+--------+--------+--------+--------------------------------------------------+--------+---------+---------+--------+--------+--------+--------+--------+------------------------------+--------+--------+--------+------------------------+--------+--------+--------+--------+--------+---------+--------------+----------------------------+--------+--------+--------+-------------------+--------+--------+--------+--------+--------+-----------+---------------+-------------------------+-----------------------+------------------------+------------------+----------------+-----------------+-------------------+----------------------+-------------------+--------------------+------------+---------------+------------+-------------+----------+-------+--------+----------+--------+----------+--------+----------+--------+----------+------

# Create Pipeline to transform data

In [None]:
# Select the numeric features whose kurtosis lies outside [lower_skew, upper_skew]
def Select_Feature_To_Scale(df, lower_skew = -2, upper_skew = 2,  types = ['int','double']):
  """Return the names of columns whose kurtosis is outside the given bounds.

  Args:
    df: Spark DataFrame; only `df.dtypes`, `df.select` and `toPandas` are used.
    lower_skew: a column is selected when its kurtosis is below this value.
    upper_skew: a column is selected when its kurtosis is above this value.
      (Despite the parameter names, only kurtosis is evaluated, not skew.)
    types: Spark dtype names (as reported by `df.dtypes`) of candidate columns.

  Returns:
    List of selected column names, grouped by the order of `types` and, within
    each type, in DataFrame column order.
  """
  # Candidate columns: every column whose dtype string matches one of `types`
  feature_list = [c for typ in types for c, t in df.dtypes if t == typ]
  if not feature_list:
    return []

  # Bring only the candidate columns to the driver; the remaining columns
  # (e.g. strings) are never needed for the kurtosis computation.
  local_frame = df.select(*dict.fromkeys(feature_list)).toPandas()

  selected_features = []
  for feature in feature_list:
    kurt = local_frame[feature].kurtosis()  # computed once per column
    if kurt < lower_skew or kurt > upper_skew:
      selected_features.append(feature)
  return selected_features

# Create the feature-engineering pipeline
def CreatedPipeline(df,add_cols = ['birth_month',"Field_82"],label = "label"):
  """Build an (unfitted) Pipeline that turns `df` into a `features` vector.

  Stages, in order:
    1. StringIndexer + OneHotEncoder for every categorical column
       (string columns, `add_cols`, and columns whose name contains "is_").
    2. VectorAssembler + StandardScaler over the numeric columns returned by
       `Select_Feature_To_Scale` -> "scaled_features".
    3. StringIndexer on `label` -> `label + "_indexed"`.
    4. VectorAssembler over the one-hot vectors and the remaining numeric
       columns -> "assembled_inputs".
    5. VectorAssembler over ["scaled_features", "assembled_inputs"] -> "features".

  Compared with the earlier version, the scaled columns are restricted to the
  numeric feature set, so the label and the categorical/flag columns can no
  longer end up inside "scaled_features". Feature order now follows the
  DataFrame column order instead of an arbitrary set order, which keeps the
  indices of the final vector stable between runs.

  Args:
    df: Spark DataFrame used only to inspect column names and dtypes (and by
      `Select_Feature_To_Scale` to compute kurtosis).
    add_cols: non-string columns that must be treated as categorical.
    label: name of the label column.

  Returns:
    pyspark.ml.Pipeline with the stages described above.
  """
  bool_features = [c for c in df.columns if "is_" in c]
  string_features = [c for c,t in df.dtypes if t == "string"]
  # De-duplicate (a column may be both string-typed and listed in add_cols or
  # match "is_") and never encode the label as an input feature.
  cat_features = [c for c in dict.fromkeys(string_features + list(add_cols) + bool_features)
                  if c != label]
  not_numeric = set(add_cols) | set(bool_features) | {label}
  num_features = [c for c,t in df.dtypes if t != "string" and c not in not_numeric]

  # Pipeline Stages List
  stages = []

  # StringIndexer followed by one-hot encoding for each categorical column
  for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_indexed")
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()],
                            outputCols=[feature + "_class_vec"])
    stages = stages + [string_indexer, encoder]

  # Scale only genuine numeric features picked by "Select_Feature_To_Scale"
  num_feature_set = set(num_features)
  unscaled_features = [c for c in Select_Feature_To_Scale(df) if c in num_feature_set]
  unscaled_assembler = VectorAssembler(inputCols=unscaled_features,outputCol="unscaled_features")
  scaler = StandardScaler(inputCol="unscaled_features",outputCol="scaled_features")
  stages = stages + [unscaled_assembler, scaler]

  # Numeric features that are passed through without scaling
  scaled_set = set(unscaled_features)
  num_unscaled_diff_list = [c for c in num_features if c not in scaled_set]

  # Label indexer
  label_indexer = StringIndexer(inputCol= label,outputCol=label + "_indexed")

  # Concatenate the one-hot vectors with the unscaled numeric features
  assembler_inputs = [feature + "_class_vec" for feature in cat_features] + num_unscaled_diff_list
  assembler = VectorAssembler(inputCols=assembler_inputs,outputCol = "assembled_inputs")
  stages = stages + [label_indexer, assembler]

  # Final assembler: scaled block first, then everything else
  assembler_final = VectorAssembler(inputCols=["scaled_features","assembled_inputs"],outputCol="features")
  stages = stages + [assembler_final]

  return Pipeline(stages=stages)

# Model training

In [None]:
def Gini(y_true, y_pred):
    """Normalized Gini coefficient of `y_pred` with respect to `y_true`.

    Both arguments must be arrays of identical shape. The true values are
    ranked once by themselves and once by the predictions (largest first);
    the area between each cumulative-share curve and the diagonal is computed
    and the prediction area is divided by the true-value area.
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # column 0 = true values, column 1 = predictions
    pairs = np.array([y_true, y_pred]).T

    def lorenz(sort_column):
        # true values reordered by descending `sort_column`, as cumulative share
        ranked = pairs[pairs[:, sort_column].argsort()][::-1, 0]
        return np.cumsum(ranked) * 1. / np.sum(ranked)

    diagonal = np.linspace(1 / n_samples, 1, n_samples)

    area_true = np.sum(diagonal - lorenz(0))
    area_pred = np.sum(diagonal - lorenz(1))

    # normalize by the Gini of a perfect ranking
    return area_pred * 1. / area_true

In [None]:
def Evaluate(model,model_name,data_name,trainingSet,devSet,boolean,label = "label_indexed"):
  """Score `model` on `devSet`, display the metrics and append them to the result CSV.

  Args:
    model: fitted model exposing `transform`, producing a "prediction" column.
    model_name: value written to the "model" column of the result row.
    data_name: value written to the "dataset" column of the result row.
    trainingSet: not used; kept so existing callers keep working.
    devSet: DataFrame to score; must contain the `label` column.
    boolean: passed as `header=` to `to_csv` (True writes the CSV header row).
    label: name of the indexed label column.

  Precision, recall and F1 are summed over the classes 0.0/1.0 that actually
  appear among the predictions and divided by 2, so a class that is never
  predicted contributes 0 to the average.
  """
  # Cache the scored frame: it is consumed twice (collect + metrics RDD) and
  # would otherwise re-run model.transform for each action.
  dev_pred = model.transform(devSet).select(label,"prediction").cache()
  try:
    # One collect keeps each label paired with its own prediction; two separate
    # collects would rely on both jobs returning rows in the same order.
    rows = dev_pred.collect()
    dev_gol_array = np.array([row[0] for row in rows])
    dev_sys_array = np.array([row[1] for row in rows])

    dev_pred_rdd = dev_pred.rdd.map(lambda row: (row["prediction"],row[label]))
    metrics_dev = MulticlassMetrics(dev_pred_rdd)
    predicted = set(dev_sys_array.tolist())
    present_classes = [c for c in [0.0,1.0] if c in predicted]
    preci = np.sum([metrics_dev.precision(c) for c in present_classes])/2
    recal = np.sum([metrics_dev.recall(c) for c in present_classes])/2
    f1 = np.sum([metrics_dev.fMeasure(c) for c in present_classes])/2
    acc = metrics_dev.accuracy
  finally:
    dev_pred.unpersist()

  # NOTE(review): Gini is computed from the hard 0/1 predictions, not from a
  # probability/score column - confirm this is the intended definition.
  gini = Gini(dev_gol_array,dev_sys_array)

  df = pd.DataFrame({"dataset": [data_name],
                   "model": [model_name],
                   "accuracy": [acc],
                   "precision": [preci],
                   "recal": [recal],
                   "f1_score": [f1],
                   "gini_core": [gini]})
  display(df)
  # Append mode: every call adds one row to the shared result file.
  df.to_csv(f"./Big Data/Data Kalapa/SourceCode/KichBanA_Result.csv",header = boolean,index = 0,mode = "a")
def Training(estimator,model_name,data_name,trainingSet,devSet,boolean = False):
  """Fit `estimator` on `trainingSet`, evaluate it on `devSet`, return the fitted model.

  A banner with `model_name` is printed first. `boolean` is forwarded to
  `Evaluate`, where it controls whether the CSV header row is written.
  """
  print(f"==================={model_name}===================")
  fitted = estimator.fit(trainingSet)
  Evaluate(model = fitted, model_name = model_name, data_name = data_name,
           trainingSet = trainingSet, devSet = devSet, boolean = boolean)
  return fitted

/content/drive/.shortcut-targets-by-id/1jpmnPqYOybAHB2aghnIs2YA5gydXzH5o/Big Data/Data Kalapa/SourceCode


In [None]:
# Tree-based estimators shared by every experiment below.
# A fixed seed (the same value used for randomSplit) makes the fitted
# models reproducible across kernel restarts.
DTC = DecisionTreeClassifier(labelCol="label_indexed", seed=2021)
RFC = RandomForestClassifier(labelCol="label_indexed", seed=2021)
GBTC = GBTClassifier(maxIter = 5,labelCol="label_indexed", seed=2021)

# Experiment with 3 Datasets

In [None]:
def FeatureInfo(f, multipliers = (1, 2, 3, 4, 5)):
  """Print the indices stored in importance vector `f` and filtered subsets of them.

  For every `k` in `multipliers` the indices whose importance is greater than
  `k / len(f.indices)` are printed under the heading "filtered<k>".

  Args:
    f: vector-like object exposing `.indices` and integer indexing
       (e.g. a model's `featureImportances`).
    multipliers: threshold multipliers to report; the default reproduces the
       original five reports.
  """
  n_impo = len(f.indices)
  print("Num of impo features: {}".format(n_impo))
  print(f.indices)

  # Nothing to filter, and the thresholds below would divide by zero.
  if n_impo == 0:
    return

  for k in multipliers:
    f_filter = [i for i in f.indices if f[int(i)] > k/n_impo]
    print("Num of filtered{} features: {}".format(k, len(f_filter)))
    print(f_filter)

## Original Dataset (**DATASET 1**)

In [None]:
# Dataset 1: the first 117 columns of the loaded frame.
df1 = df.select(df.columns[:117])
print("NumOfColumn: {}".format(len(df1.columns)))

# NOTE(review): the pipeline is fitted on the full frame before the train/dev
# split, so indexer/scaler statistics also see the dev rows - confirm intended.
pip1_transform = CreatedPipeline(df1,add_cols = ["Field_82"])
trans_df1_ = pip1_transform.fit(df1)
# overwrite() lets this cell be re-run when the target directory already exists.
trans_df1_.write().overwrite().save("./Big Data/Data Kalapa/SourceCode/Pipeline1A")
trans_df1 = trans_df1_.transform(df1)
trainingSet1, devSet1 = trans_df1.randomSplit([.9,.1],seed = 2021)

#display(trainingSet1.select("features").first())
#input_dim = len(trainingSet1.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 117


In [None]:
# Fit the three tree models on Dataset 1 and print their feature importances.
# The first call passes boolean=True so the result CSV receives its header row.
model11 = Training(DTC,"Decision Tree","Dataset1a",trainingSet1,devSet1,boolean = True)
f11 = model11.featureImportances
FeatureInfo(f11)

model21 = Training(RFC,"Random Forest","Dataset1a",trainingSet1,devSet1)
f21 = model21.featureImportances
FeatureInfo(f21)

model31 = Training(GBTC,"Gradient Boosted Trees","Dataset1a",trainingSet1,devSet1)
f31 = model31.featureImportances
FeatureInfo(f31)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1a,Decision Tree,0.718271,0.593515,0.706808,0.586967,0.025424


Num of impo features: 15
[    0    28 63635 63734 63737 63738 63755 63756 63759 63769 63771 63772
 63773 63795 63815]
Num of filtered1 features: 2
[63755, 63756]
Num of filtered2 features: 2
[63755, 63756]
Num of filtered3 features: 1
[63755]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1a,Random Forest,0.675411,0.5,0.337706,0.403132,-0.125915


Num of impo features: 168
[    0     6     7     9    13    16    17    20    30    32  2846  2970
  6256  6532  6540  6556  6581  6592  6682  7191  9839  9886  9899 10260
 10262 10291 10719 10805 11198 11221 11608 11698 11877 12002 12402 13137
 13175 16107 17918 19306 20839 20967 21127 21406 21533 21650 22659 23674
 24065 24222 25965 29813 33240 33301 33386 35047 35064 35065 35083 35359
 35458 35785 39519 39531 39742 40061 40267 40468 40626 40808 42308 42552
 43967 43983 44188 44193 44197 44198 45534 45570 45582 45624 45625 45714
 45795 45805 46525 47077 47175 47193 48178 48221 48681 48794 52427 55512
 56525 56536 56646 57999 58741 59274 59508 59534 59548 59563 59673 59763
 59957 60417 60510 60543 60555 60577 60668 60727 60835 61097 61367 61468
 61473 61507 61657 61730 61851 61945 62514 62552 62557 62584 62788 63136
 63588 63589 63691 63734 63735 63737 63740 63741 63742 63745 63748 63754
 63755 63758 63760 63765 63766 63768 63769 63771 63775 63776 63777 63778
 63779 63781 63786 63796 

Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset1a,Gradient Boosted Trees,0.721042,0.597932,0.711394,0.593136,0.034428


Num of impo features: 71
[    0    11    12    20    22    23    28    31    32  7128  9888  9893
  9913  9916 18170 35062 35088 44439 45533 45535 45577 56527 56568 59545
 59568 61662 61871 62614 63635 63734 63736 63737 63738 63740 63741 63743
 63745 63748 63754 63755 63756 63758 63759 63764 63767 63769 63771 63772
 63773 63781 63782 63785 63786 63787 63791 63793 63794 63795 63796 63797
 63798 63799 63800 63803 63810 63811 63812 63814 63815 63816 63817]
Num of filtered1 features: 12
[0, 63734, 63736, 63738, 63740, 63743, 63755, 63756, 63772, 63795, 63811, 63816]
Num of filtered2 features: 7
[0, 63736, 63738, 63740, 63755, 63756, 63795]
Num of filtered3 features: 5
[63736, 63738, 63740, 63755, 63756]


## Add the additional columns extracted from the timestamp (**DATASET 2**)

In [None]:
# Dataset 2: the first 184 columns of the loaded frame.
df2 = df.select(df.columns[:184])
print("NumOfColumn: {}".format(len(df2.columns)))

# NOTE(review): the pipeline is fitted on the full frame before the train/dev
# split, so indexer/scaler statistics also see the dev rows - confirm intended.
pip2_transform = CreatedPipeline(df2)
trans_df2_ = pip2_transform.fit(df2)
# overwrite() lets this cell be re-run when the target directory already exists.
trans_df2_.write().overwrite().save("./Big Data/Data Kalapa/SourceCode/Pipeline2A")
trans_df2 = trans_df2_.transform(df2)
trainingSet2, devSet2 = trans_df2.randomSplit([.9,.1],seed = 2021)

#display(trainingSet2.select("features").first())
#input_dim = len(trainingSet2.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 184


In [None]:
# Fit the three tree models on Dataset 2 and print their feature importances.
# f32 (GBT importances) and model32 are reused by later cells.
model12 = Training(DTC,"Decision Tree","Dataset2a",trainingSet2,devSet2)
f12 = model12.featureImportances
FeatureInfo(f12)

model22 = Training(RFC,"Random Forest","Dataset2a",trainingSet2,devSet2)
f22 = model22.featureImportances
FeatureInfo(f22)

model32 = Training(GBTC,"Gradient Boosted Trees","Dataset2a",trainingSet2,devSet2)
f32 = model32.featureImportances
FeatureInfo(f32)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2a,Decision Tree,0.716978,0.602905,0.6888,0.602488,0.031269


Num of impo features: 15
[   16    44 63651 63750 63753 63754 63783 63784 63811 63813 63831 63853
 63855 63873 63896]
Num of filtered1 features: 2
[63783, 63784]
Num of filtered2 features: 2
[63783, 63784]
Num of filtered3 features: 1
[63783]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2a,Random Forest,0.675411,0.5,0.337706,0.403132,-0.125915


Num of impo features: 207
[    6     7     8    13    14    16    17    20    30    32    39    44
    77   100   462   822  3008  3202  4192  5281  5793  6075  6548  6565
  6576  6700  6826  7674  9337  9858  9924  9958 10131 10162 10285 10689
 11577 13304 13326 13805 14040 14665 16396 18079 18780 20845 21001 21088
 21812 21923 22256 22539 23494 23774 29477 30524 33502 33560 35081 35220
 35291 35345 36048 36573 38529 38684 39215 39538 39545 39554 39626 39655
 39662 39857 41740 42619 42624 43185 43940 43958 43983 43990 44010 44019
 44022 44047 44204 44219 44226 44315 44474 44898 45547 45593 45594 45596
 45600 45641 45820 46024 46434 46573 47015 47400 47899 48440 49959 50753
 51613 52552 53595 56479 56539 56554 56577 56584 58472 58757 59156 59536
 59572 59578 59642 59685 59834 60160 60383 60384 60398 60432 60436 60453
 60475 60510 60674 60703 61057 61120 61325 61328 61459 61460 61520 61612
 61641 61661 61691 61724 61839 61879 61905 61975 62213 62618 62658 62727
 62861 63152 63202 63216 

Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset2a,Gradient Boosted Trees,0.720303,0.601967,0.701737,0.599949,0.032248


Num of impo features: 82
[    0     7    12    13    15    16    25    35    36    38    39    44
  6574  6599  6609  7144  9860  9904  9924  9932  9938  9986 21056 35104
 43984 44455 56584 60432 61678 61704 61726 62576 63651 63750 63752 63753
 63754 63756 63757 63759 63782 63783 63784 63805 63808 63811 63813 63817
 63821 63825 63826 63828 63830 63831 63834 63836 63837 63843 63845 63846
 63847 63851 63852 63853 63855 63858 63864 63865 63866 63867 63870 63873
 63875 63880 63882 63884 63886 63891 63892 63893 63895 63896]
Num of filtered1 features: 13
[15, 16, 44, 63750, 63752, 63754, 63756, 63783, 63784, 63830, 63831, 63873, 63882]
Num of filtered2 features: 9
[15, 16, 63752, 63754, 63756, 63783, 63784, 63830, 63873]
Num of filtered3 features: 6
[16, 63752, 63754, 63756, 63783, 63784]


## Add all additional columns (**DATASET 3**)

In [None]:
# Dataset 3: every column of the loaded frame.
df3 = df
print("NumOfColumn: {}".format(len(df3.columns)))
# NOTE(review): the pipeline is fitted on the full frame before the train/dev
# split, so indexer/scaler statistics also see the dev rows - confirm intended.
pip3_transform = CreatedPipeline(df3)
trans_df3_ = pip3_transform.fit(df3)
# overwrite() lets this cell be re-run when the target directory already exists.
trans_df3_.write().overwrite().save("./Big Data/Data Kalapa/SourceCode/Pipeline3A")
trans_df3 = trans_df3_.transform(df3)
trainingSet3, devSet3 = trans_df3.randomSplit([.9,.1],seed = 2021)

#display(trainingSet3.select("features").first())
#input_dim = len(trainingSet3.select("features").first()[0])
#print("Input dim: ",input_dim)

NumOfColumn: 196


In [None]:
# Fit the three tree models on Dataset 3 and print their feature importances.
model13 = Training(DTC,"Decision Tree","Dataset3a",trainingSet3,devSet3)
f13 = model13.featureImportances
FeatureInfo(f13)

model23 = Training(RFC,"Random Forest","Dataset3a",trainingSet3,devSet3)
f23 = model23.featureImportances
FeatureInfo(f23)

model33 = Training(GBTC,"Gradient Boosted Trees","Dataset3a",trainingSet3,devSet3)
f33 = model33.featureImportances
FeatureInfo(f33)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3a,Decision Tree,0.716978,0.602905,0.6888,0.602488,0.031269


Num of impo features: 15
[   19    47 63654 63753 63756 63757 65517 65518 65548 65550 65568 65592
 65594 65612 65638]
Num of filtered1 features: 2
[65517, 65518]
Num of filtered2 features: 2
[65517, 65518]
Num of filtered3 features: 1
[65517]


Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3a,Random Forest,0.675411,0.5,0.337706,0.403132,-0.125915


Num of impo features: 217
[    0     3     5     6     7    16    17    19    23    26    28    30
    32    37    43    45    47    51    57   351   775  1628  2348  2604
  2628  3161  3519  3581  4672  6565  6579  6670  6813  8258  9858  9863
  9911  9925  9956 10077 10207 10472 10617 11431 11725 11959 12168 12500
 12598 12921 13330 13821 17411 18551 20984 21243 21491 21492 22477 22619
 22797 23177 23656 23894 24643 24677 24780 25028 25753 27609 29854 30342
 30860 31348 33329 34281 35097 35115 35230 35253 35633 35766 35820 36238
 36410 36496 36798 37485 39111 39579 39941 39955 42207 42670 43153 43944
 43986 43987 44014 44043 44624 45551 45555 45557 45561 45562 45592 45596
 45600 45606 45634 45685 45713 45737 45811 46110 47422 47460 49864 50757
 51550 53215 53829 54892 59527 59603 59752 59836 60060 60077 60387 60396
 60434 60436 60467 60548 60567 60665 60825 61160 61182 61258 61461 61462
 61463 61469 61471 61474 61491 61662 61667 61673 61789 61812 61848 62146
 62187 62533 62534 62573 

Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Dataset3a,Gradient Boosted Trees,0.719195,0.600998,0.698833,0.598829,0.032475


Num of impo features: 80
[    0     1     7    12    13    15    19    28    38    39    41    42
    47   113  6560  6575  6612  7147  9907  9918  9935 12283 35094 35107
 43987 44212 45632 56587 61681 61707 61729 61732 63654 63753 63755 63756
 63757 63759 63760 63762 63764 65502 65503 65516 65517 65518 65545 65548
 65550 65558 65562 65565 65567 65568 65569 65571 65573 65574 65580 65582
 65584 65586 65589 65590 65592 65594 65603 65605 65606 65609 65612 65613
 65620 65622 65625 65632 65633 65634 65637 65638]
Num of filtered1 features: 12
[15, 19, 47, 63753, 63755, 63757, 63759, 65517, 65518, 65567, 65612, 65622]
Num of filtered2 features: 8
[15, 19, 63755, 63757, 63759, 65517, 65518, 65612]
Num of filtered3 features: 5
[63755, 63757, 63759, 65517, 65518]


**Tiếp theo, thực hiện sử dụng thuộc tính rút trích để chạy model để thực hiện so sánh với TH gốc, các kịch bản vạch ra:**

*  Sử dụng bộ dataset cho f1_score kết quả tốt nhất (**DATASET2**)
*  Sử dụng tree-based tốt nhất để rút trích thuộc tính quan trọng (**GBT**)
  * Sử dụng toàn phần thuộc tính quan trọng
  * Sử dụng một phần thuộc tính quan trọng
* Chạy lại 3 model tree-based
* Thực hiện so sánh





========================================================================

# Model using impo features for DATASET2

**Full impo features**

In [None]:
def Convert1(x,f):
  """Project feature vector `x` onto every index stored in importance vector `f`.

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices`.

  Returns:
    Dense vector holding `x`'s value at each index of `f.indices`, in that order.
  """
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[int(i)] for i in f.indices])
# UDF bound to the Dataset 2 GBT importances (f32)
convert1 = f.udf(lambda x: Convert1(x,f32),VectorUDT())
trainingSet12 = trainingSet2.withColumn("features",convert1("features"))
devSet12 = devSet2.withColumn("features",convert1("features"))

In [None]:
# Re-train the three tree models on the vectors reduced by Convert1 (all important indices).
model112 = Training(DTC,"Decision Tree","Fulla",trainingSet12,devSet12)
model122 = Training(RFC,"Random Forest","Fulla",trainingSet12,devSet12)
model132 = Training(GBTC,"Gradient Boosted Trees","Fulla",trainingSet12,devSet12)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Fulla,Decision Tree,0.716978,0.602905,0.6888,0.602488,0.031269




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Fulla,Random Forest,0.71199,0.575858,0.713031,0.55815,0.000219




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Fulla,Gradient Boosted Trees,0.720303,0.601967,0.701737,0.599949,0.032248


**A part of impo features (filtered1)**

In [None]:
def Convert2(x,f):
  """Project `x` onto the indices of `f` whose importance exceeds 1/len(f.indices).

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices` and integer indexing.

  Returns:
    Dense vector holding `x`'s value at each retained index, in `f.indices` order.
  """
  threshold = 1/len(f.indices)
  kept = [int(i) for i in f.indices if f[int(i)] > threshold]
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[i] for i in kept])
# UDF bound to the Dataset 2 GBT importances (f32)
convert2 = f.udf(lambda x: Convert2(x,f32),VectorUDT())
trainingSet22 = trainingSet2.withColumn("features",convert2("features"))
devSet22 = devSet2.withColumn("features",convert2("features"))

In [None]:
# Re-train on the "filtered1" vectors (Convert2). The runs are logged as
# "Filtered1a"; they were previously written as "Filtered2a", which collided
# with the rows of the real filtered2 experiment in the result CSV.
model212 = Training(DTC,"Decision Tree","Filtered1a",trainingSet22,devSet22)
model222 = Training(RFC,"Random Forest","Filtered1a",trainingSet22,devSet22)
model232 = Training(GBTC,"Gradient Boosted Trees","Filtered1a",trainingSet22,devSet22)

**A part of impo features (filtered2)**

In [None]:
def Convert3(x,f):
  """Project `x` onto the indices of `f` whose importance exceeds 2/len(f.indices).

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices` and integer indexing.

  Returns:
    Dense vector holding `x`'s value at each retained index, in `f.indices` order.
  """
  threshold = 2/len(f.indices)
  kept = [int(i) for i in f.indices if f[int(i)] > threshold]
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[i] for i in kept])
# UDF bound to the Dataset 2 GBT importances (f32)
convert3 = f.udf(lambda x: Convert3(x,f32),VectorUDT())
trainingSet32 = trainingSet2.withColumn("features",convert3("features"))
devSet32 = devSet2.withColumn("features",convert3("features"))

In [None]:
# Re-train the three tree models on the "filtered2" vectors produced by Convert3.
model312 = Training(DTC,"Decision Tree","Filtered2a",trainingSet32,devSet32)
model322 = Training(RFC,"Random Forest","Filtered2a",trainingSet32,devSet32)
model332 = Training(GBTC,"Gradient Boosted Trees","Filtered2a",trainingSet32,devSet32)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered1a,Decision Tree,0.719749,0.613677,0.687553,0.617167,0.043922




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered1a,Random Forest,0.718825,0.592447,0.711529,0.584852,0.022159




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered1a,Gradient Boosted Trees,0.721227,0.61285,0.692472,0.615752,0.051346


**A part of impo features (filtered3)**

In [None]:
def Convert4(x,f):
  """Project `x` onto the indices of `f` whose importance exceeds 3/len(f.indices).

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices` and integer indexing.

  Returns:
    Dense vector holding `x`'s value at each retained index, in `f.indices` order.
  """
  threshold = 3/len(f.indices)
  kept = [int(i) for i in f.indices if f[int(i)] > threshold]
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[i] for i in kept])
# UDF bound to the Dataset 2 GBT importances (f32)
convert4 = f.udf(lambda x: Convert4(x,f32),VectorUDT())
trainingSet42 = trainingSet2.withColumn("features",convert4("features"))
devSet42 = devSet2.withColumn("features",convert4("features"))

In [None]:
# Re-train the three tree models on the "filtered3" vectors produced by Convert4.
model412 = Training(DTC,"Decision Tree","Filtered3a",trainingSet42,devSet42)
model422 = Training(RFC,"Random Forest","Filtered3a",trainingSet42,devSet42)
model432 = Training(GBTC,"Gradient Boosted Trees","Filtered3a",trainingSet42,devSet42)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2a,Decision Tree,0.717717,0.604782,0.689294,0.605062,0.041823




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2a,Random Forest,0.716054,0.598082,0.690973,0.595393,0.022843




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered2a,Gradient Boosted Trees,0.720303,0.606845,0.695591,0.607377,0.038855


**A part of impo features (filtered4)**

In [None]:
def Convert5(x,f):
  """Project `x` onto the indices of `f` whose importance exceeds 4/len(f.indices).

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices` and integer indexing.

  Returns:
    Dense vector holding `x`'s value at each retained index, in `f.indices` order.
  """
  threshold = 4/len(f.indices)
  kept = [int(i) for i in f.indices if f[int(i)] > threshold]
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[i] for i in kept])
# UDF bound to the Dataset 2 GBT importances (f32). It must call Convert5:
# the previous version called Convert4, so the "filtered4" experiment silently
# reused the filtered3 threshold.
convert5 = f.udf(lambda x: Convert5(x,f32),VectorUDT())
trainingSet52 = trainingSet2.withColumn("features",convert5("features"))
devSet52 = devSet2.withColumn("features",convert5("features"))

In [None]:
# Re-train the three tree models on the "filtered4" vectors (trainingSet52 / devSet52).
model512 = Training(DTC,"Decision Tree","Filtered4a",trainingSet52,devSet52)
model522 = Training(RFC,"Random Forest","Filtered4a",trainingSet52,devSet52)
model532 = Training(GBTC,"Gradient Boosted Trees","Filtered4a",trainingSet52,devSet52)

**A part of impo features (filtered5)**

In [None]:
def Convert6(x,f):
  """Project `x` onto the indices of `f` whose importance exceeds 5/len(f.indices).

  Args:
    x: feature vector of one row (sparse or dense).
    f: importance vector exposing `.indices` and integer indexing.

  Returns:
    Dense vector holding `x`'s value at each retained index, in `f.indices` order.
  """
  threshold = 5/len(f.indices)
  kept = [int(i) for i in f.indices if f[int(i)] > threshold]
  # Integer indexing returns 0.0 for positions a sparse vector does not store
  # and also works on dense vectors, so no `.indices` membership test is needed.
  return Vectors.dense([x[i] for i in kept])
# UDF bound to the Dataset 2 GBT importances (f32)
convert6 = f.udf(lambda x: Convert6(x,f32),VectorUDT())
trainingSet62 = trainingSet2.withColumn("features",convert6("features"))
devSet62 = devSet2.withColumn("features",convert6("features"))

In [None]:
# Re-train the three tree models on the "filtered5" vectors produced by Convert6.
model612 = Training(DTC,"Decision Tree","Filtered5a",trainingSet62,devSet62)
model622 = Training(RFC,"Random Forest","Filtered5a",trainingSet62,devSet62)
model632 = Training(GBTC,"Gradient Boosted Trees","Filtered5a",trainingSet62,devSet62)



Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4a,Decision Tree,0.699797,0.570527,0.662109,0.556345,-0.031539




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4a,Random Forest,0.699797,0.570527,0.662109,0.556345,-0.031539




Unnamed: 0,dataset,model,accuracy,precision,recal,f1_score,gini_core
0,Filtered4a,Gradient Boosted Trees,0.699797,0.570527,0.662109,0.556345,-0.031539


========================================================================


In [None]:
# Persist the Dataset 2 GBT model; overwrite() lets this cell be re-run when
# the target directory already exists.
model32.write().overwrite().save("./Big Data/Data Kalapa/SourceCode/ModelA")

# Extract features importances
* **Khái niệm về sparse vector** <br>
Sparse vector sử dụng khi các weight của 1 vector bằng 0 nhiều<br>
Ví dụ: Ta có một vector 500 chiều, với chiều ở index 0 có giá trị 1.0, index 335 có giá trị 2.0, index = 499 có giá trị 3.0, **tất cả index còn lại có giá trị 0**<br>
=> ***Sparse Vector***: 
 * Dạng từ điển: (500, {0: 1.0, 335: 2.0, 499: 3.0} - **index: value**)
 * Dạng kiểu list: (500, [0, 335, 499] - **Danh sách index**, [1.0, 2.0, 3.0] - **Danh sách values/weights**)
* **Xử lý với các đặc trưng quan trọng có dạng sparse vector**
 * Đầu vào: (500, [28, 33, 449], [1.0, 2.0, 3.0])
 * Đặc trưng quan trọng: (500, [1,2,28,29,55],[0.01,0.02,0.37,0.4,0.2])
<b>Bước 1</b>: Tạo từ điển với các index trong "Đặc trưng quan trọng"<br>
Từ điển: {1: 0.0, 2: 0.0, 28: 0.0,29: 0.0,55: 0.0}<br>
<b>Bước 2</b>: Điền vào từ điển với các ***giá trị trong đầu vào*** nếu ***index của đầu vào có trong từ điển***<br>
Từ điển: {1: 0.0, 2: 0.0, **28: 1.0**, 29: 0.0, 55: 0.0}<br>
<b>Bước 3</b>: Tạo một Dense Vector từ các giá trị trong từ điển<br>
[0.0, 0.0, **1.0**, 0.0, 0.0]<br>

==> Kết quả sau khi xử lý: [0.0, 0.0, **1.0**, 0.0, 0.0]


