In [1]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=b252f72c294676e453f0bf10874bb67a73115491cb5f31de393d113ef4325ada
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
! pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
import findspark

In [4]:
# Let findspark locate the Spark installation and make pyspark importable
# before any pyspark import is attempted.
findspark.init()

In [7]:
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext

In [8]:
# Reuse the SparkContext that is already running when this cell is executed
# again: calling SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# SQL entry point used to read the CSV below.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession; it is kept here so `sqlcon` keeps its type for existing cells.
sqlcon = SQLContext(sc)



In [9]:
# Bare expression: display the SparkContext created above as the cell output.
sc

In [10]:
# Load the training CSV with Spark's built-in CSV reader. The first row is the
# header and column types are inferred from the data. The original
# 'com.databricks.spark.csv' format name belongs to the external spark-csv
# package that predates the built-in source.
data = sqlcon.read.csv('train.csv', header=True, inferSchema=True)

In [11]:
# Preview the first 5 rows of the loaded frame (long cell values are truncated).
data.show(5)

+-------------------+--------------+--------------------+---------+----------+--------------+--------------------+-------------------+------------------+
|              Dates|      Category|            Descript|DayOfWeek|PdDistrict|    Resolution|             Address|                  X|                 Y|
+-------------------+--------------+--------------------+---------+----------+--------------+--------------------+-------------------+------------------+
|2015-05-13 23:53:00|      WARRANTS|      WARRANT ARREST|Wednesday|  NORTHERN|ARREST, BOOKED|  OAK ST / LAGUNA ST|  -122.425891675136|  37.7745985956747|
|2015-05-13 23:53:00|OTHER OFFENSES|TRAFFIC VIOLATION...|Wednesday|  NORTHERN|ARREST, BOOKED|  OAK ST / LAGUNA ST|  -122.425891675136|  37.7745985956747|
|2015-05-13 23:33:00|OTHER OFFENSES|TRAFFIC VIOLATION...|Wednesday|  NORTHERN|ARREST, BOOKED|VANNESS AV / GREE...|   -122.42436302145|  37.8004143219856|
|2015-05-13 23:30:00| LARCENY/THEFT|GRAND THEFT FROM ...|Wednesday|  NORTHER

In [12]:
# Columns that are not used by the text classifier; only Category (the target)
# and Descript (the free-text input) remain after this cell.
drop_list = [
    'Dates',
    'DayOfWeek',
    'PdDistrict',
    'Resolution',
    'Address',
    'X',
    'Y',
]

# Keep the original column order, minus everything named in drop_list.
kept_columns = [name for name in data.columns if name not in drop_list]
data = data.select(kept_columns)
data.show(5)

+--------------+--------------------+
|      Category|            Descript|
+--------------+--------------------+
|      WARRANTS|      WARRANT ARREST|
|OTHER OFFENSES|TRAFFIC VIOLATION...|
|OTHER OFFENSES|TRAFFIC VIOLATION...|
| LARCENY/THEFT|GRAND THEFT FROM ...|
| LARCENY/THEFT|GRAND THEFT FROM ...|
+--------------+--------------------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import col

# Number of rows per crime category, listed from rarest to most common
# (ascending count), showing up to 50 categories.
category_counts = data.groupBy("Category").count()
category_counts.orderBy(col('count')).show(50)

+--------------------+------+
|            Category| count|
+--------------------+------+
|                TREA|     6|
|PORNOGRAPHY/OBSCE...|    22|
|            GAMBLING|   146|
|SEX OFFENSES NON ...|   148|
|           EXTORTION|   256|
|             BRIBERY|   289|
|          BAD CHECKS|   406|
|     FAMILY OFFENSES|   491|
|             SUICIDE|   508|
|        EMBEZZLEMENT|  1166|
|           LOITERING|  1225|
|               ARSON|  1513|
|         LIQUOR LAWS|  1903|
|             RUNAWAY|  1946|
|DRIVING UNDER THE...|  2268|
|          KIDNAPPING|  2341|
|   RECOVERED VEHICLE|  3138|
|         DRUNKENNESS|  4280|
|  DISORDERLY CONDUCT|  4320|
|SEX OFFENSES FORC...|  4388|
|     STOLEN PROPERTY|  4540|
|            TRESPASS|  7326|
|        PROSTITUTION|  7484|
|         WEAPON LAWS|  8555|
|     SECONDARY CODES|  9985|
|FORGERY/COUNTERFE...| 10609|
|               FRAUD| 16679|
|             ROBBERY| 23000|
|      MISSING PERSON| 25989|
|      SUSPICIOUS OCC| 31414|
|         

In [16]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

In [17]:
# Target: map each Category string to a numeric index in the "label" column.
label_string = StringIndexer(
    inputCol="Category",
    outputCol="label",
)

# Step 1: split Descript into tokens, using runs of non-word characters
# (the \W pattern) as separators. Result goes to "words".
regextoknizerrr = RegexTokenizer(
    inputCol="Descript",
    outputCol="words",
    pattern="\\W",
)

# Step 2: drop stop words from "words"; result goes to "filiterd".
stopwordsremoverr = StopWordsRemover(
    inputCol="words",
    outputCol="filiterd",
)

# Step 3: turn the filtered tokens into term-count vectors in "features".
# The vocabulary is capped at 10000 terms, and minDF=5 sets the minimum
# number of documents a term must appear in to be kept.
CountVectorizerr = CountVectorizer(
    inputCol="filiterd",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

In [18]:
from pyspark.ml import Pipeline

# Stages run in list order: tokenize -> remove stop words -> count-vectorize,
# plus indexing of the Category column into "label".
feature_stages = [
    regextoknizerrr,
    stopwordsremoverr,
    CountVectorizerr,
    label_string,
]
piplinne = Pipeline(stages=feature_stages)

# NOTE(review): the pipeline is fitted on the full frame, i.e. before the
# train/test split, so the vocabulary and label index also see the test rows.
piplinefit = piplinne.fit(data)

# Apply the fitted stages to add the words/filiterd/features/label columns.
dataset = piplinefit.transform(data)
dataset.show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+-----+
|      Category|            Descript|               words|            filiterd|            features|label|
+--------------+--------------------+--------------------+--------------------+--------------------+-----+
|      WARRANTS|      WARRANT ARREST|   [warrant, arrest]|   [warrant, arrest]|(781,[13,26],[1.0...|  7.0|
|OTHER OFFENSES|TRAFFIC VIOLATION...|[traffic, violati...|[traffic, violati...|(781,[8,13,29],[1...|  1.0|
|OTHER OFFENSES|TRAFFIC VIOLATION...|[traffic, violati...|[traffic, violati...|(781,[8,13,29],[1...|  1.0|
| LARCENY/THEFT|GRAND THEFT FROM ...|[grand, theft, fr...|[grand, theft, lo...|(781,[0,1,2,4],[1...|  0.0|
| LARCENY/THEFT|GRAND THEFT FROM ...|[grand, theft, fr...|[grand, theft, lo...|(781,[0,1,2,4],[1...|  0.0|
+--------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [22]:
# 75% / 25% random split; the fixed seed keeps the split repeatable.
split_weights = [0.75, 0.25]
traindata, testdata = dataset.randomSplit(split_weights, seed=623)

# Report the size of each partition (each count triggers a Spark job).
train_rows = traindata.count()
print("train is", train_rows)
test_rows = testdata.count()
print("test is", test_rows)

train is 659025
test is 219024


In [26]:
# Hyperparameters for the classifier: at most 20 optimizer iterations,
# regularization strength 0.3, and elasticNetParam=0 (no L1 component).
lr_params = dict(maxIter=20, regParam=0.3, elasticNetParam=0)
lr = LogisticRegression(**lr_params)

# Fit on the training partition only.
lrmodel = lr.fit(traindata)

In [27]:
# Score the held-out partition with the fitted model; the result carries the
# input columns plus rawPrediction, probability and prediction.
perdections = lrmodel.transform(testdata)

In [30]:
# Bare expression: show the DataFrame's column names and types as cell output.
perdections

DataFrame[Category: string, Descript: string, words: array<string>, filiterd: array<string>, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [34]:
# Columns to inspect side by side: input text, true category, class
# probabilities, numeric label and predicted label.
shown_columns = ["Descript", "Category", "probability", "label", "prediction"]

# NOTE(review): "probability" is a vector column, so this descending sort is
# presumably driven by its leading entries (class 0 first) rather than by the
# highest class probability of each row - confirm this is the intended order.
preview = perdections.select(shown_columns)
preview.orderBy("probability", ascending=False).show()

+--------------------+-------------+--------------------+-----+----------+
|            Descript|     Category|         probability|label|prediction|
+--------------------+-------------+--------------------+-----+----------+
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, BICYCLE, <...|LARCENY/THEFT|[0.89024284641874...|  0.0|       0.0|
|THEFT, GRAND, BY ...|LARCENY/THEFT|[0.88703324369410...|  0.0|       0.0|
|THEFT, GRAND, BY ...|LAR

In [35]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the test-set predictions. No metricName is given, so the
# evaluator's default metric is reported (f1 according to the PySpark docs).
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
score = evaluator.evaluate(perdections)
print(score)

0.9736743524003013
