In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [3]:
import pandas as pd

In [4]:
import re

In [5]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField


In [6]:
from collections import OrderedDict


In [7]:
# Create (or reuse) the notebook's Spark session; every later cell goes
# through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("LogMining")
    .master("local")
    .config("spark.executor.cores", "6")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [8]:
#StructField("EventId", StringType(), True),
#StructField("EventTemplate", StringType(), True)
# file = open(,"r")

In [9]:
from pyspark.sql import Row

In [10]:
# Header layout of one log line: five whitespace-free tokens separated by
# single spaces, then the free-text remainder of the line. Despite the name,
# this notebook applies it to HDFS log lines (Date, Time, Pid, Level,
# Component, Content). Written as a raw string so `\S` reaches the regex
# engine without relying on Python passing unknown escapes through.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) (\S+) (\S+) (.*)'


In [11]:
logline = "081109 203518 35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906"

In [12]:
type(logline)

str

In [13]:
match = re.findall(APACHE_ACCESS_LOG_PATTERN, logline)


In [14]:
match[0]

('081109',
 '203518',
 '35',
 'INFO',
 'dfs.FSNamesystem:',
 'BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906')

In [15]:
def parse_hdfs_log_line(logline):
    """Parse one HDFS log line into a Row of its six header fields.

    The line is matched against APACHE_ACCESS_LOG_PATTERN: five
    whitespace-free tokens followed by the free-text remainder of the line.

    Args:
        logline: A single raw log line (str).

    Returns:
        Row with the fields Date, Time, Pid, Level, Component and Content,
        filled from the pattern's six capture groups in that order.

    Raises:
        ValueError: If the line does not match the pattern.
    """
    # re.findall returns a list (empty when nothing matches), never None, so
    # emptiness is what has to be tested before indexing into the result.
    match = re.findall(APACHE_ACCESS_LOG_PATTERN, logline)
    if not match:
        raise ValueError("Invalid logline: %s" % logline)
    date, time, pid, level, component, content = match[0]
    return Row(
        Date=date,
        Time=time,
        Pid=pid,
        Level=level,
        Component=component,
        Content=content)

In [16]:
abc = parse_hdfs_log_line(logline)

In [17]:
# Scratch check of the template-masking regex: every run of digits, dots and
# dashes is replaced with the wildcard token "<*>".
s = "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.73.220:50010 is added to blk_7128370237687728475 size 67108864"
# Raw string so `\d` is passed to the regex engine verbatim instead of being
# an invalid Python string escape.
s = re.sub(r"[\d.-]+", "<*>", s)
print(s)

BLOCK* NameSystem<*>addStoredBlock: blockMap updated: <*>:<*> is added to blk_<*> size <*>


In [18]:
# fo = open("abc.txt", "r")
# file_line = fo.readlines()

In [19]:
# file_line

In [20]:
# file_line[1]

In [21]:
# customSchema = StructType([
#     StructField("Date", IntegerType(), True),
#     StructField("Time", IntegerType(), True),
#     StructField("Pid", IntegerType(), True),
#     StructField("Pid", IntegerType(), True),
#     StructField("Component", StringType(), True),
#     StructField("Content", StringType(), True),
# #     StructField("EventId", StringType(), True),
# #     StructField("EventTemplate", StringType(), True)
# ])

In [22]:
import hashlib

In [23]:
def parse_hdfs_file(file):
    """Parse every line of an open HDFS log file into a list of Rows.

    Each line is matched against APACHE_ACCESS_LOG_PATTERN; lines that do not
    match are skipped. For each matching line the Content field is turned
    into an event template by replacing every run of digits, dots and dashes
    with "<*>", and an 8-character EventId is derived from an MD5 digest of
    that template.

    Args:
        file: An open text file (or any iterable of log lines).

    Returns:
        list of Row with the fields Date, Time, Pid, Level, Component,
        Content, EventTemplate and EventId, one per matching line, in file
        order.
    """
    rows = []
    # Iterate the handle directly so the whole file is never held in memory
    # as a list of lines.
    for raw_line in file:
        # The pattern is anchored with '^', so a line yields at most one match.
        match = re.match(APACHE_ACCESS_LOG_PATTERN, raw_line)
        if match is None:
            continue
        date, time, pid, level, component, content = match.groups()
        template = re.sub(r"[\d.-]+", "<*>", content)
        # NOTE(review): ' '.join over a str inserts a space between every
        # character before hashing. It is kept unchanged on purpose so the
        # EventId values stay identical to those produced by earlier runs.
        event_id = hashlib.md5(' '.join(template).encode('utf-8')).hexdigest()[0:8]
        rows.append(Row(
            Date=date,
            Time=time,
            Pid=pid,
            Level=level,
            Component=component,
            Content=content,
            EventTemplate=template,
            EventId=event_id))
    return rows

In [24]:
fo = open("Train.log", "r")

In [25]:
# Parse the whole training log, then release the file handle: nothing after
# this cell reads from `fo`, and leaving it open leaks the descriptor for the
# lifetime of the kernel.
try:
    parsed_file = parse_hdfs_file(fo)
finally:
    fo.close()

In [26]:
parsed_file

[Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010', Date='081109', EventId='2ea739e7', EventTemplate='Receiving block blk_<*> src: /<*>:<*> dest: /<*>:<*>', Level='INFO', Pid='143', Time='203518'),
 Row(Component='dfs.FSNamesystem:', Content='BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906', Date='081109', EventId='8b9dee11', EventTemplate='BLOCK* NameSystem<*>allocateBlock: /mnt/hadoop/mapred/system/job_<*>_<*>/job<*>jar<*> blk_<*>', Level='INFO', Pid='35', Time='203518'),
 Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010', Date='081109', EventId='2ea739e7', EventTemplate='Receiving block blk_<*> src: /<*>:<*> dest: /<*>:<*>', Level='INFO', Pid='143', Time='203519'),
 Row(Component='dfs.DataNode$DataXceiver:', Content='Receiving blo

In [27]:
# Look the field up by name: the position of a field inside a Row built from
# keyword arguments depends on the Spark version's field ordering, so a
# positional index is not stable.
parsed_file[1]['Content']

'BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906'

In [28]:
# parsed_file = []
# for x in parsed_file[]:
#     if x not in output:
#         output.append(x)
# print (output)

In [33]:
# Schema for the parsed log rows: every field is a nullable string, listed in
# the column order the DataFrame should have.
customSchema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "Date",
        "Time",
        "Pid",
        "Level",
        "Component",
        "Content",
        "EventTemplate",
        "EventId",
    )
])

In [34]:
df_train = spark.createDataFrame(parsed_file,schema = customSchema)

In [35]:
df_train.show(5)

+------+------+---+-----+--------------------+--------------------+--------------------+--------+
|  Date|  Time|Pid|Level|           Component|             Content|       EventTemplate| EventId|
+------+------+---+-----+--------------------+--------------------+--------------------+--------+
|081109|203518|143| INFO|dfs.DataNode$Data...|Receiving block b...|Receiving block b...|2ea739e7|
|081109|203518| 35| INFO|   dfs.FSNamesystem:|BLOCK* NameSystem...|BLOCK* NameSystem...|8b9dee11|
|081109|203519|143| INFO|dfs.DataNode$Data...|Receiving block b...|Receiving block b...|2ea739e7|
|081109|203519|145| INFO|dfs.DataNode$Data...|Receiving block b...|Receiving block b...|2ea739e7|
|081109|203519|145| INFO|dfs.DataNode$Pack...|PacketResponder 1...|PacketResponder <...|fd96e398|
+------+------+---+-----+--------------------+--------------------+--------------------+--------+
only showing top 5 rows



In [36]:
df_train.toPandas()['EventId'].unique()

array(['2ea739e7', '8b9dee11', 'fd96e398', 'a7fb7c2e', '1defe93d',
       '66a23a40', '1cb650a2', '6c9db9d3', '0193e112', '6bcaef6a',
       '62adf7b3', '0ec49ccc', '159ab19e', 'e20ddac9', 'c49fb6f1',
       '133547b1'], dtype=object)

In [104]:
# def append_event(read_df):
    
#     for i in len(df_train):
#         parsed_file[i]['EventTemplate'] = re.sub("[\d.-]+", "<*>", parsed_file[i][1])
        
        

In [105]:
# s = "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.73.220:50010 is added to blk_7128370237687728475 size 67108864"
# s = re.sub("[\d.-]+", "<*>", s)
# print(s)

In [39]:
# content = "Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010"


In [40]:
#  def generate_logformat_regex(logformat):
#     ''' 
#     Function to generate regular expression to split log messages
#     '''
#     headers = []
#     splitters = re.split(r'(<[^<>]+>)', logformat)
#     regex = ''
#     for k in range(len(splitters)):
#         if k % 2 == 0:
#             splitter = re.sub(' +', '\s+', splitters[k])
#             regex += splitter
#         else:
#             header = splitters[k].strip('<').strip('>')
#             regex += '(?P<%s>.*?)' % header
#             headers.append(header)
#     regex = re.compile('^' + regex + '$')
#     return headers, regex

In [41]:
# names = df_train.schema.Content


In [42]:
# rex = r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?' 
# lil = re.sub(rex,'<*>', content)


In [43]:
# generate_logformat_regex(content)

In [44]:
 def read_file(data):
    """Group event IDs by the block IDs mentioned in each log row.

    Collects `data` to the driver and, for every row, finds all
    `blk_<number>` tokens in its Content. The row's EventId is appended once
    to the sequence of each distinct block it mentions.

    Args:
        data: Spark DataFrame with at least `Content` and `EventId` columns.

    Returns:
        Spark DataFrame with columns `BlockId` and `EventSequence` (the list
        of EventIds for that block, in row order); blocks appear in
        first-seen order.
    """
    events_by_block = OrderedDict()
    for log_row in data.collect():
        # A row may mention the same block more than once; count it once.
        for block_id in set(re.findall(r'(blk_-?\d+)', log_row['Content'])):
            events_by_block.setdefault(block_id, []).append(log_row['EventId'])
    block_sequences = list(events_by_block.items())
    return spark.createDataFrame(block_sequences, schema=['BlockId', 'EventSequence'])

LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate


In [45]:
df_train_append = read_file(df_train)

In [46]:
df_train_append.show(10)

+--------------------+--------------------+
|             BlockId|       EventSequence|
+--------------------+--------------------+
|blk_-160899968791...|[2ea739e7, 8b9dee...|
|blk_7503483334202...|[2ea739e7, 2ea739...|
|blk_-354458337728...|[2ea739e7, 62adf7...|
|blk_-907399258668...|[2ea739e7, 159ab1...|
|blk_7854771516489...|[2ea739e7, 2ea739...|
|blk_1717858812220...|[2ea739e7, 2ea739...|
|blk_-251961732037...|[2ea739e7, e20dda...|
|blk_7063315473424...|[2ea739e7, 2ea739...|
|blk_8586544123689...|[2ea739e7, 2ea739...|
|blk_2765344736980...|[2ea739e7, 2ea739...|
+--------------------+--------------------+
only showing top 10 rows



In [47]:
# abc_log = spark.read.format("csv").option("header", "false").schema(customSchema).load("log.csv")

In [48]:
# abc_log.show(5)

In [49]:
# ab = read_file("log.csv")

In [50]:
# type(ab)

In [51]:
# ab.head(10)

In [52]:
# schema 

In [53]:
# ab = read_file("log.csv")

In [54]:
# type(ab)

In [55]:
# ab.show(10)

In [56]:
label_csv = spark.read.format("csv").option("header", "true").load("anomaly_label.csv")

In [57]:
label_csv.show(10)

+--------------------+-------+
|             BlockId|  Label|
+--------------------+-------+
|blk_-160899968791...| Normal|
|blk_7503483334202...| Normal|
|blk_-354458337728...|Anomaly|
|blk_-907399258668...| Normal|
|blk_7854771516489...| Normal|
|blk_1717858812220...| Normal|
|blk_-251961732037...| Normal|
|blk_7063315473424...| Normal|
|blk_8586544123689...| Normal|
|blk_2765344736980...| Normal|
+--------------------+-------+
only showing top 10 rows



In [58]:
# label_data = label_csv.set_index('BlockId')
label_data =label_csv.toPandas().set_index('BlockId')
label_dict = label_data['Label'].to_dict()

In [59]:
label_dict

{'blk_-1608999687919862906': 'Normal',
 'blk_7503483334202473044': 'Normal',
 'blk_-3544583377289625738': 'Anomaly',
 'blk_-9073992586687739851': 'Normal',
 'blk_7854771516489510256': 'Normal',
 'blk_1717858812220360316': 'Normal',
 'blk_-2519617320378473615': 'Normal',
 'blk_7063315473424667801': 'Normal',
 'blk_8586544123689943463': 'Normal',
 'blk_2765344736980045501': 'Normal',
 'blk_-2900490557492272760': 'Normal',
 'blk_-50273257731426871': 'Normal',
 'blk_4394112519745907149': 'Normal',
 'blk_3640100967125688321': 'Normal',
 'blk_-40115644493265216': 'Normal',
 'blk_-8531310335568756456': 'Anomaly',
 'blk_-3409923645141256069': 'Normal',
 'blk_3974948352784823938': 'Normal',
 'blk_5647760196018207394': 'Normal',
 'blk_-202775138379690649': 'Normal',
 'blk_-8120118743897862909': 'Normal',
 'blk_8376667364205250596': 'Normal',
 'blk_-1942808544656255720': 'Normal',
 'blk_3947106522258141922': 'Anomaly',
 'blk_-4513615671112005170': 'Normal',
 'blk_5703440264997337028': 'Normal',
 

In [64]:
# def udf_anomaly(abc):
#         if label_dict[abc] == 'Anomaly':
#             return 1
#         else:
#             return 0

In [65]:
# df_train_append.select("BlockId").rdd.map(lambda x: 1 if label_dict[x] == 'Anomaly' else 0).toDF()


In [66]:
# df_train_append.withColumn("label", udf_anomaly(df_train_append['BlockId']))

In [67]:
df_train_append_pd = df_train_append.toPandas()

In [68]:
df_train_append_pd

Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,"[2ea739e7, 8b9dee11, 2ea739e7, 2ea739e7, fd96e..."
1,blk_7503483334202473044,"[2ea739e7, 2ea739e7, 66a23a40, 2ea739e7, fd96e..."
2,blk_-3544583377289625738,"[2ea739e7, 62adf7b3, 2ea739e7, 2ea739e7, fd96e..."
3,blk_-9073992586687739851,"[2ea739e7, 159ab19e, 2ea739e7, 2ea739e7, fd96e..."
4,blk_7854771516489510256,"[2ea739e7, 2ea739e7, e20ddac9, 2ea739e7]"
5,blk_1717858812220360316,"[2ea739e7, 2ea739e7, e20ddac9]"
6,blk_-2519617320378473615,"[2ea739e7, e20ddac9, 2ea739e7]"
7,blk_7063315473424667801,"[2ea739e7, 2ea739e7, 2ea739e7, e20ddac9]"
8,blk_8586544123689943463,"[2ea739e7, 2ea739e7, 2ea739e7, e20ddac9]"
9,blk_2765344736980045501,"[2ea739e7, 2ea739e7, e20ddac9, 2ea739e7]"


In [69]:
def _is_anomaly(block_id):
    """Return 1 when label_dict marks `block_id` as 'Anomaly', otherwise 0.

    Raises KeyError when the block has no entry in label_dict.
    """
    return int(label_dict[block_id] == 'Anomaly')


# Attach the binary target to every block sequence.
df_train_append_pd['Label'] = df_train_append_pd['BlockId'].apply(_is_anomaly)

In [70]:
df_event = spark.createDataFrame(df_train_append_pd)

In [71]:
df_event.show(10)

+--------------------+--------------------+-----+
|             BlockId|       EventSequence|Label|
+--------------------+--------------------+-----+
|blk_-160899968791...|[2ea739e7, 8b9dee...|    0|
|blk_7503483334202...|[2ea739e7, 2ea739...|    0|
|blk_-354458337728...|[2ea739e7, 62adf7...|    1|
|blk_-907399258668...|[2ea739e7, 159ab1...|    0|
|blk_7854771516489...|[2ea739e7, 2ea739...|    0|
|blk_1717858812220...|[2ea739e7, 2ea739...|    0|
|blk_-251961732037...|[2ea739e7, e20dda...|    0|
|blk_7063315473424...|[2ea739e7, 2ea739...|    0|
|blk_8586544123689...|[2ea739e7, 2ea739...|    0|
|blk_2765344736980...|[2ea739e7, 2ea739...|    0|
+--------------------+--------------------+-----+
only showing top 10 rows



In [72]:
# from pyspark.sql import Row
# from pyspark.sql.types import *
# from pyspark.mllib.linalg import DenseVector
# from pyspark.ml.feature import StringIndexer, VectorIndexer
# from pyspark.ml.classification import RandomForestClassifier
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.sql.functions import *

In [73]:
# inputData = df_event.rdd.map(lambda x:(x[2],DenseVector(x[1])))

In [74]:
# type(inputData)

In [75]:
# dataFrameSplit=inputData.toDF(["label","features"])

In [76]:
# inputData = df_event.map(lambda x:(x[0],DenseVector(x[1:])))
# dataFrameSplit=inputData.toDF(["label","features"])
# dataFrameSplit=dataFrameSplit.withColumn("label",dataFrameSplit["label"]).cast(DoubleType())

# train,test =dataFrameSplit.randomSplit([.8,.2])
# labelIndexer=StringIndexer(inputCol="label",outputCol="indexedLabel").fit(dataFrameSplit)
# randomForestClassifier =RandomForestClassifier(labelCol="indexedLabel",featuresCol="features")
# rfModel = randomForestClassifier.fit(labelIndexer.transform())
# predictions = rfModel.transform(test)
# evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
# accuracy = evaluator.evaluate(predictions)
# print("Accuracy = %g" % (1.0-accuracy))

In [78]:
# df_train_append['Label'] = df_train_append['BlockId'].flatmap(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

In [None]:
# (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, data_df['Label'].values, train_ratio, split_type)

In [None]:
# label_data = pd.read_csv("anomaly_label.csv", engine='c', na_filter=False, memory_map=True)
# label_data = label_data.set_index('BlockId')
# label_dict = label_data['Label'].to_dict()
# ab.select[ab['BlockId'].rdd.flatmap(lambda x: 1 if label_dict[x] == 'Anomaly' else 0).toDF()
# # df.select("_c0").rdd.flatMap(lambda x: x + ("anything", )).toDF()

# # # Split trgiain and test data
# # (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values, data_df['Label'].values, train_ratio, split_type)

In [79]:
from sklearn.model_selection import train_test_split

In [80]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

In [81]:
# Hold out 20% of the block sequences for testing. The shuffle is seeded so a
# Restart & Run All reproduces the same split (and therefore the same
# downstream shapes and scores); 42 matches the seed used for the classifier.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_append_pd['EventSequence'].values,
    df_train_append_pd['Label'].values,
    test_size=0.2,
    random_state=42,
)

In [82]:
# model = DecisionTree.trainClassifier(x_train, numClasses=2, categoricalFeaturesInfo={},
#                                      impurity='gini', maxDepth=5, maxBins=32)

# # Evaluate model on test instances and compute test error
# predictions = model.predict(x_test)
# labelsAndPredictions = y_test.zip(predictions)

In [83]:
# train_y, test_y = label_csv.randomSplit([0.8, 0.2], seed=12345)

In [84]:
# train_x.show(10)

In [85]:
# from pyspark.mllib.feature import HashingTF
# from pyspark.mllib.feature import IDF

In [86]:
# hashingTF = HashingTF(100000)
# tf = hashingTF.transform(documents)

In [87]:
# tf.cache()
# idf = IDF(minDocFreq=1).fit(tf)

In [88]:
# labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# # Automatically identify categorical features, and index them.
# # We specify maxCategories so features with > 4 distinct values are treated as continuous.
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# # Split the data into training and test sets (30% held out for testing)
# (trainingData, testData) = data.randomSplit([0.7, 0.3])

# # Train a DecisionTree model.
# dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# # Chain indexers and tree in a Pipeline
# pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# # Train model.  This also runs the indexers.
# model = pipeline.fit(trainingData)

# # Make predictions.
# predictions = model.transform(testData)

# # Select example rows to display.
# predictions.select("prediction", "indexedLabel", "features").show(5)

# # Select (prediction, true label) and compute test error
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
# accuracy = evaluator.evaluate(predictions)
# print "Test Error = %g" % (1.0 - accuracy)

# treeModel = model.stages[2]

In [89]:
import pandas as pd
import os
import numpy as np
import re
import sys
from collections import Counter
from scipy.special import expit
from itertools import compress

In [90]:
class FeatureExtractor(object):
    """Turn sequences of event IDs into an event-count feature matrix.

    `fit_transform` learns the event vocabulary (and, depending on the
    options, IDF weights and column means) from the training sequences;
    `transform` applies the same vocabulary and statistics to new sequences.
    """

    def __init__(self):
        # Per-column IDF weights; set by fit_transform when term_weighting == 'tf-idf'.
        self.idf_vec = None
        # 1 x num_event column means; set by fit_transform when normalization == 'zero-mean'.
        self.mean_vec = None
        # Event IDs kept as feature columns, in column order.
        self.events = None
        # Options remembered from fit_transform so transform applies the same steps.
        self.term_weighting = None
        self.normalization = None
        self.oov = None

    @staticmethod
    def _count_events(X_seq):
        """Return a DataFrame of per-sequence event counts (absent events are 0)."""
        X_counts = [Counter(seq) for seq in X_seq]
        return pd.DataFrame(X_counts).fillna(0)

    @staticmethod
    def _sigmoid_nonzero(X):
        """Return a float copy of X with the logistic function applied to non-zero cells."""
        # Cast first: when no count is missing the matrix is integer-typed, and
        # writing sigmoid values (all in (0, 1)) into it would truncate them to 0.
        X = X.astype(float)
        nonzero = X != 0
        X[nonzero] = expit(X[nonzero])
        return X

    def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
        """Learn the vocabulary and statistics from X_seq and return its feature matrix.

        Args:
            X_seq: Iterable of event-ID sequences (one sequence per instance).
            term_weighting: 'tf-idf' to scale each column by
                log(num_instance / (document_frequency + 1e-8)); any other
                value leaves the raw counts.
            normalization: 'zero-mean' to subtract the training column means,
                'sigmoid' to apply the logistic function to non-zero cells;
                any other value applies nothing.
            oov: When true, append one extra column for out-of-vocabulary
                events.
            min_count: Only used when `oov` is true and `min_count > 1`:
                events present in fewer than `min_count` sequences are dropped
                from the vocabulary, and the extra column holds, per sequence,
                how many of those dropped events it contains.

        Returns:
            2-D numpy array with one row per sequence and one column per kept
            event (plus the out-of-vocabulary column when `oov` is true).
        """
        print('====== Transformed train data summary ======')
        self.term_weighting = term_weighting
        self.normalization = normalization
        self.oov = oov

        X_df = self._count_events(X_seq)
        self.events = X_df.columns
        X = X_df.values
        if self.oov:
            oov_vec = np.zeros(X.shape[0])
            if min_count > 1:
                # Keep only events that occur in at least min_count sequences.
                idx = np.sum(X > 0, axis=0) >= min_count
                oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
                X = X[:, idx]
                self.events = np.array(X_df.columns)[idx].tolist()
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            # Document frequency: number of sequences containing each event.
            df_vec = np.sum(X > 0, axis=0)
            self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
            X = X * np.tile(self.idf_vec, (num_instance, 1))
        if self.normalization == 'zero-mean':
            mean_vec = X.mean(axis=0)
            self.mean_vec = mean_vec.reshape(1, num_event)
            X = X - np.tile(self.mean_vec, (num_instance, 1))
        elif self.normalization == 'sigmoid':
            X = self._sigmoid_nonzero(X)
        X_new = X

        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new

    def transform(self, X_seq):
        """Build the feature matrix for X_seq using the fitted vocabulary and statistics.

        Events from the fitted vocabulary that do not occur in X_seq become
        all-zero columns. Events outside the vocabulary are dropped, or, when
        the extractor was fitted with `oov=True`, counted per sequence in the
        extra last column.

        Args:
            X_seq: Iterable of event-ID sequences (one sequence per instance).

        Returns:
            2-D numpy array with the same column layout as the matrix returned
            by `fit_transform`.
        """
        print('====== Transformed test data summary ======')
        X_df = self._count_events(X_seq)
        # Add zero columns for vocabulary events that never occur in X_seq.
        empty_events = set(self.events) - set(X_df.columns)
        for event in empty_events:
            X_df[event] = [0] * len(X_df)
        X = X_df[self.events].values
        if self.oov:
            oov_vec = np.sum(X_df[X_df.columns.difference(self.events)].values > 0, axis=1)
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            X = X * np.tile(self.idf_vec, (num_instance, 1))
        if self.normalization == 'zero-mean':
            X = X - np.tile(self.mean_vec, (num_instance, 1))
        elif self.normalization == 'sigmoid':
            X = self._sigmoid_nonzero(X)
        X_new = X

        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))

        return X_new


In [91]:
# x_train = df_train_append_pd['EventSequence']

In [92]:
# Fit the count/TF-IDF vocabulary on the training sequences only, then apply
# the same vocabulary and IDF weights to the held-out sequences.
# NOTE: x_train / x_test are rebound from event-ID sequences to numeric
# matrices here, so this cell is not safe to run twice without first
# re-running the train_test_split cell.
feature_extractor = FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
x_test = feature_extractor.transform(x_test)


Train data shape: 61-by-15

Test data shape: 16-by-15



In [93]:
# type(x_test)

In [94]:
# model = DecisionTree.trainClassifier(x_train, numClasses=2, categoricalFeaturesInfo={},
#                                      impurity='gini', maxDepth=5, maxBins=32)


In [95]:
# from pyspark.ml.classification import LogisticRegression
# lr = LogisticRegression(featuresCol = 'x_train', labelCol = 'y_train', maxIter=10)
# lrModel = lr.fit(x_test)

In [96]:
# x_test

In [97]:
# dt = DecisionTreeClassifier(labelCol="y_train", featuresCol="x_train")


In [98]:
# dt

In [99]:
# model = dt.fit(x_test)



In [100]:
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth-1 trees, with a fixed seed for the forest.
rfclf = RandomForestClassifier(max_depth=1, random_state=42)
rfclf = rfclf.fit(x_train, y_train)

from sklearn.metrics import f1_score

# Score the held-out sequences. The printed value is a micro-averaged F1,
# which for single-label predictions coincides with plain accuracy.
# NOTE(review): in the recorded run every prediction is 0, so the score only
# reflects the share of normal blocks in the test split.
predicted_y = rfclf.predict(x_test)
print("test accuracy: ",f1_score(y_test, predicted_y, average='micro'))

test accuracy:  0.9375




In [101]:
predicted_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [103]:
# `sklearn.externals.joblib` is deprecated and absent from newer scikit-learn
# releases; prefer the standalone `joblib` package that scikit-learn depends
# on, and fall back to the vendored copy only in old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted random forest next to the notebook.
joblib.dump(rfclf, "nlp-trained-model.pkl")

['nlp-trained-model.pkl']