## Machine Learning

In this section the goal is to put all tweets in the space and find the best classifying hyperplane. 

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Python Spark Logistic Regression example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [101]:
data = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("feats2.csv")

# data.cache() # Cache data for faster reuse

In [26]:
data.select(data['f1'], data['labels']).show()

+---+------+
| f1|labels|
+---+------+
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
|  0|     1|
+---+------+
only showing top 20 rows



In [17]:
data.printSchema()

root
 |-- f1: integer (nullable = true)
 |-- f2: integer (nullable = true)
 |-- f3: integer (nullable = true)
 |-- f4: integer (nullable = true)
 |-- f5: integer (nullable = true)
 |-- f6: integer (nullable = true)
 |-- f7: integer (nullable = true)
 |-- f8: integer (nullable = true)
 |-- f9: integer (nullable = true)
 |-- f10: integer (nullable = true)
 |-- f11: integer (nullable = true)
 |-- f12: integer (nullable = true)
 |-- f13: integer (nullable = true)
 |-- f14: integer (nullable = true)
 |-- f15: integer (nullable = true)
 |-- f16: integer (nullable = true)
 |-- f17: integer (nullable = true)
 |-- f18: integer (nullable = true)
 |-- f19: integer (nullable = true)
 |-- f20: integer (nullable = true)
 |-- f21: integer (nullable = true)
 |-- f22: integer (nullable = true)
 |-- f23: integer (nullable = true)
 |-- f24: integer (nullable = true)
 |-- f25: integer (nullable = true)
 |-- f26: integer (nullable = true)
 |-- f27: integer (nullable = true)
 |-- f28: integer (nullable = tr

In [80]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

features = ('f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15','f16','f17','f18','f19','f20','f21','f22','f23','f24','f25','f26','f27','f28','f29','f30','f31','f32','f33','f34','f35','f36','f37','f38','f39','f40','f41','f42','f43','f44','f45','f46','f47','f48','f49','f50','f51','f52','f53','f54','f55','f56','f57','f58','f59','f60','f61','f62','f63','f64','f65','f66','f67','f68','f69','f70','f71','f72','f73','f74','f75','f76','f77','f78','f79','f80','f81','f82','f83','f84','f85','f86','f87','f88','f89','f90','f91','f92','f93','f94','f95','f96','f97','f98','f99','f100','f101','f102','f103','f104','f105','f106','f107','f108','f109','f110','f111','f112','f113','f114','f115','f116','f117','f118','f119','f120','f121','f122','f123','f124','f125','f126','f127','f128','f129','f130','f131','f132','f133','f134','f135','f136','f137','f138','f139','f140','f141','f142','f143','f144','f145','f146','f147','f148','f149','f150','f151','f152','f153','f154','f155','f156','f157','f158','f159','f160','f161','f162','f163','f164','f165','f166','f167','f168','f169','f170','f171','f172','f173','f174','f175','f176','f177','f178','f179','f180','f181','f182','f183','f184','f185','f186','f187','f188','f189','f190','f191','f192','f193','f194','f195','f196','f197','f198','f199','f200','f201','f202','f203','f204','f205','f206','f207','f208','f209','f210','f211','f212','f213','f214','f215','f216','f217','f218','f219','f220','f221','f222','f223','f224','f225','f226','f227','f228','f229','f230','f231','f232','f233','f234','f235','f236','f237','f238','f239','f240','f241','f242','f243','f244','f245','f246','f247','f248','f249','f250','f251','f252','f253','f254','f255','f256','f257','f258','f259','f260','f261','f262','f263','f264','f265','f266','f267','f268','f269','f270','f271','f272','f273','f274','f275','f276','f277','f278','f279','f280','f281','f282','f283','f284','f285','f286','f287','f288','f289','f290','f291','f292','f293','f294','f295','f296','f297','f298','f299','f300','f301','f302','f303','f304','f305','f306','f307','f308','f309','f310','f311','f312','f313','f314','f315','f316','f317','f318','f319','f320','f321','f322','f323','f324','f325','f326','f327','f328','f329','f330','f331')

assembler = VectorAssembler(inputCols=features,outputCol="features")
 
raw_data=assembler.transform(data)


In [88]:
train, test = raw_data.randomSplit([0.7, 0.3])

In [82]:
test = raw_data.select(raw_data['features'], raw_data['labels']).show(5)

+--------------------+------+
|            features|labels|
+--------------------+------+
|(331,[38,46,88,97...|     1|
|(331,[32,41,46,66...|     1|
|(331,[86,230],[1....|     1|
|(331,[13,17,83,20...|     1|
|(331,[68,99,109,1...|     1|
+--------------------+------+
only showing top 5 rows



In [89]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="labels", featuresCol="features",maxIter=10)
model=lr.fit(train)

In [84]:
print("coefficients: " + str(model.coefficientMatrix))

coefficients: DenseMatrix([[ -2.37932653e-02,   2.44601633e+00,  -1.61262932e+00,
                1.63918599e+00,  -9.14891148e-01,  -1.03858103e+00,
                1.06852549e+00,   2.94475783e-01,   6.91665836e-01,
               -3.81396008e-02,   9.99783643e-01,   1.06852549e+00,
                4.99891822e-01,   2.96470548e+00,  -3.81396008e-02,
               -9.60737282e-01,  -1.66800046e+00,   2.96470548e+00,
               -7.50769428e-01,   9.99121296e-01,   2.72576706e+00,
               -1.42619463e+00,  -1.73172168e-01,   1.67013145e+00,
               -4.10294260e+00,   1.50554018e+00,  -1.26751149e-01,
                2.15567218e-01,   2.09492639e+00,  -8.57814601e-02,
                1.78906667e-01,   2.29917608e-02,   1.15192931e+00,
               -9.60737282e-01,   1.16419664e+00,  -9.27467655e-01,
                1.05087220e-01,   1.37477100e+00,  -1.01895384e-01,
                2.09492639e+00,   1.16419664e+00,   1.15192931e+00,
                9.99121296e-01,   

In [90]:
evaluation_summary = model.evaluate(test)

In [96]:
print("Accuracy is:",evaluation_summary.accuracy)

('Accuracy is:', 0.5333333333333333)


In [98]:
print("precision is:")
evaluation_summary.precisionByLabel

precision is:


[0.5, 0.6666666666666666]

## Used links

[1] https://databricks.com/spark/getting-started-with-apache-spark/machine-learning

[2] https://www.bmc.com/blogs/python-spark-machine-learning-classification/

[3] https://spark.apache.org/docs/latest/ml-classification-regression.html#binomial-logistic-regression