For our first taste of programming with Spark, we'll revisit the Lending Club dataset you first used earlier in the course. Instead of building a random forest though, we'll perform a logistic regression to determine the likelihood that someone will be approved for a loan.

Some notes:
* data loading: Pyspark can't naively load a csv from a URL so we want to copy the data file into our container and load from there. This isn't the case for something like HDFS however, you can load directly from Hadoop. For details see [here](https://spark.apache.org/docs/latest/rdd-programming-guide.html#external-datasets)
* Note on dropping columns - need to create a new dataframe. It won't remove columns in-place.

* Data prep strategy:
    1. Drop columns that have high portion of nulls. Only keep columns with fewer than 10 nulls total. Otherwise drop it.
    2. From remaining columns, review the schema and type. Cast any string columns that should be numeric into the appropriate value.
    3. Once all column types are correct, look at the categorical variables, and create the dummies as needed.
    
* Here is the big thing - the data then needs to be transformed from a multicolumn CSV dataframe into a label / feature RDD.
    * The RDD has to have only two columns: labels and a vector with all the features.
    * To achieve this we will combine remaining numeric variables and the dummy variables, then compress them into single row vectors.
    
* At that point data prep will be complete and we can run the random forest.

In [1]:
CSV_PATH = "/home/ds/notebooks/datasets/LoanStats3d.csv"
APP_NAME = "Lending Club Random Forest Example"
SPARK_URL = "local[*]"
RANDOM_SEED = 141107
TRAINING_DATA_RATIO = 0.7
RF_NUM_TREES = 10
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName(APP_NAME) \
    .master(SPARK_URL) \
    .getOrCreate()

df = spark.read \
    .options(header = "true", inferschema = "true") \
    .csv(CSV_PATH)

In [4]:
from pyspark.sql.functions import isnan, when, count, col

null_counts = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas().to_dict(orient='records')

In [5]:
null_counts[0]

{'acc_now_delinq': 1,
 'acc_open_past_24mths': 1,
 'addr_state': 0,
 'all_util': 399723,
 'annual_inc': 0,
 'annual_inc_joint': 420583,
 'application_type': 0,
 'avg_cur_bal': 0,
 'bc_open_to_buy': 3963,
 'bc_util': 4227,
 'chargeoff_within_12_mths': 0,
 'collection_recovery_fee': 0,
 'collections_12_mths_ex_med': 0,
 'debt_settlement_flag': 0,
 'debt_settlement_flag_date': 415607,
 'deferral_term': 419185,
 'delinq_2yrs': 0,
 'delinq_amnt': 0,
 'desc': 421050,
 'disbursement_method': 1,
 'dti': 2,
 'dti_joint': 420586,
 'earliest_cr_line': 0,
 'emp_length': 0,
 'emp_title': 23872,
 'funded_amnt': 0,
 'funded_amnt_inv': 0,
 'grade': 0,
 'hardship_amount': 419185,
 'hardship_dpd': 419185,
 'hardship_end_date': 419185,
 'hardship_flag': 1,
 'hardship_last_payment_amount': 419185,
 'hardship_length': 419185,
 'hardship_loan_status': 419185,
 'hardship_payoff_balance_amount': 419185,
 'hardship_reason': 419185,
 'hardship_start_date': 419185,
 'hardship_status': 419185,
 'hardship_type': 4

In [6]:
null_columns_to_drop = [key for key, value in null_counts[0].items() if value > 10]

In [7]:
null_columns_to_drop

['id',
 'member_id',
 'emp_title',
 'url',
 'desc',
 'title',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'revol_util',
 'last_pymnt_d',
 'next_pymnt_d',
 'mths_since_last_major_derog',
 'annual_inc_joint',
 'dti_joint',
 'verification_status_joint',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'bc_open_to_buy',
 'bc_util',
 'mo_sin_old_il_acct',
 'mths_since_recent_bc',
 'mths_since_recent_bc_dlq',
 'mths_since_recent_inq',
 'mths_since_recent_revol_delinq',
 'num_tl_120dpd_2m',
 'percent_bc_gt_75',
 'revol_bal_joint',
 'sec_app_earliest_cr_line',
 'sec_app_inq_last_6mths',
 'sec_app_mort_acc',
 'sec_app_open_acc',
 'sec_app_revol_util',
 'sec_app_open_act_il',
 'sec_app_num_rev_accts',
 'sec_app_chargeoff_within_12_mths',
 'sec_app_collections_12_mths_ex_med',
 'sec_app_mths_since_last_major_derog',


In [8]:
print(len(df.columns))
print(len(null_counts[0].keys()))
print(len(null_columns_to_drop))

145
145
69


In [9]:
df_updated = df.drop(*null_columns_to_drop)

In [10]:
print(len(df_updated.columns))

76


In [11]:
print("Total number of rows: %d" % df_updated.count())

Total number of rows: 421095


In [12]:
df_updated.printSchema()

root
 |-- loan_amnt: integer (nullable = true)
 |-- funded_amnt: integer (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- earliest_cr_line: string (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nul

We still have a ton of columns and we probably should look at reducing it further. Now let's look at only the columns that are strings, and check whether we should convert them to a numeric datatype.

In [13]:
categoricals = [col[0] for col in df_updated.dtypes if col[1] == 'string']
print(len(categoricals))

27


In [14]:
df_updated.select(categoricals[:6]).show(5)
df_updated.select(categoricals[6:12]).show(5)
df_updated.select(categoricals[12:17]).show(5)
df_updated.select(categoricals[17:22]).show(5)
df_updated.select(categoricals[22:]).show(5)

+----------+--------+-----+---------+----------+--------------+
|      term|int_rate|grade|sub_grade|emp_length|home_ownership|
+----------+--------+-----+---------+----------+--------------+
| 60 months|   9.80%|    B|       B3| 10+ years|      MORTGAGE|
| 60 months|  12.88%|    C|       C2|    1 year|      MORTGAGE|
| 36 months|  15.77%|    D|       D1|   2 years|      MORTGAGE|
| 36 months|  10.78%|    B|       B4|   8 years|          RENT|
| 36 months|   7.49%|    A|       A4| 10+ years|      MORTGAGE|
+----------+--------+-----+---------+----------+--------------+
only showing top 5 rows

+----------+-------------------+--------+-----------+----------+------------------+
|annual_inc|verification_status| issue_d|loan_status|pymnt_plan|           purpose|
+----------+-------------------+--------+-----------+----------+------------------+
|     65000|       Not Verified|Dec-2015| Fully Paid|         n|debt_consolidation|
|     70000|       Not Verified|Dec-2015|    Current|         n

We want to make the following numeric:
term, int_rate, annual_inc, inq_last_6mths, total_acc, out_prncp, dti, last_payment_amnt

We'll drop the following columns:
last_credit_pull_d, issue_d, zip_code, addr_state, earliest_cr_line

In [15]:
from pyspark.sql.functions import regexp_replace

df_updated = df_updated.withColumn('term', regexp_replace(df_updated['term'], " months", "").cast("int"))
df_updated = df_updated.withColumn('int_rate', regexp_replace(df_updated['int_rate'], "%", "").cast("float"))
df_updated = df_updated.withColumn('annual_inc', df_updated['annual_inc'].cast("int"))
df_updated = df_updated.withColumn('inq_last_6mths', df_updated['inq_last_6mths'].cast("int"))
df_updated = df_updated.withColumn('total_acc', df_updated['total_acc'].cast("int"))
df_updated = df_updated.withColumn('out_prncp', df_updated['out_prncp'].cast("float"))
df_updated = df_updated.withColumn('dti', df_updated['dti'].cast("float"))
df_updated = df_updated.withColumn('last_pymnt_amnt', df_updated['last_pymnt_amnt'].cast("float"))

df_final = df_updated.drop('last_credit_pull_d', 'issue_d', 'zip_code', 'addr_state', 'earliest_cr_line')

In [16]:
df_final.printSchema()

root
 |-- loan_amnt: integer (nullable = true)
 |-- funded_amnt: integer (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: integer (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: integer (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- out_prncp: flo

In [17]:
categoricals = [col[0] for col in df_final.dtypes if col[1] == 'string']
categoricals.remove('loan_status')
uniques = [df_final.select(col).distinct().count() for col in categoricals]
categorical_unique_counts = dict(zip(categoricals, uniques))

In [18]:
len(categoricals)

13

In [22]:
df_final.select(categoricals[:8]).show(10)
df_final.select(categoricals[8:]).show(10)

+-----+---------+----------+--------------+-------------------+----------+------------------+-------------------+
|grade|sub_grade|emp_length|home_ownership|verification_status|pymnt_plan|           purpose|initial_list_status|
+-----+---------+----------+--------------+-------------------+----------+------------------+-------------------+
|    B|       B3| 10+ years|      MORTGAGE|       Not Verified|         n|debt_consolidation|                  w|
|    C|       C2|    1 year|      MORTGAGE|       Not Verified|         n|debt_consolidation|                  w|
|    D|       D1|   2 years|      MORTGAGE|       Not Verified|         n|  home_improvement|                  w|
|    B|       B4|   8 years|          RENT|    Source Verified|         n|debt_consolidation|                  w|
|    A|       A4| 10+ years|      MORTGAGE|       Not Verified|         n|debt_consolidation|                  w|
|    C|       C4| 10+ years|           OWN|    Source Verified|         n|debt_consolida

In [34]:
cols = df_final.columns

In [36]:
cols.remove('loan_status')

In [45]:
final_cols = []
final_cols.append('loan_status')
final_cols.extend(cols)

In [48]:
df_final = df_final.select(*final_cols)

In [52]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

transformed_df = df_final.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

splits = [TRAINING_DATA_RATIO, 1.0 - TRAINING_DATA_RATIO]
training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 55.0 failed 1 times, most recent failure: Lost task 1.0 in stage 55.0 (TID 2691, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 346, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1041, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/local/spark/python/pyspark/rdd.py", line 1041, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/local/spark/python/pyspark/rddsampler.py", line 95, in func
    for obj in iterator:
  File "<ipython-input-52-92eb4d49f41a>", line 4, in <lambda>
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 882, in dense
    return DenseVector(elements)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 289, in __init__
    ar = np.array(ar, dtype=np.float64)
ValueError: could not convert string to float: 'Fully Paid'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:467)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 177, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 2423, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/local/spark/python/pyspark/rdd.py", line 346, in func
    return f(iterator)
  File "/usr/local/spark/python/pyspark/rdd.py", line 1041, in <lambda>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/local/spark/python/pyspark/rdd.py", line 1041, in <genexpr>
    return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
  File "/usr/local/spark/python/pyspark/rddsampler.py", line 95, in func
    for obj in iterator:
  File "<ipython-input-52-92eb4d49f41a>", line 4, in <lambda>
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 882, in dense
    return DenseVector(elements)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 289, in __init__
    ar = np.array(ar, dtype=np.float64)
ValueError: could not convert string to float: 'Fully Paid'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [20]:
categorical_unique_counts

{'application_type': 3,
 'collections_12_mths_ex_med': 11,
 'debt_settlement_flag': 3,
 'disbursement_method': 2,
 'emp_length': 13,
 'grade': 7,
 'hardship_flag': 3,
 'home_ownership': 5,
 'initial_list_status': 3,
 'purpose': 15,
 'pymnt_plan': 3,
 'sub_grade': 35,
 'verification_status': 4}

now we need to convert the categoricals to dummies

In [21]:
categorical_dummies = [c+'_dmy' for c in categoricals]
 
indexers = [StringIndexer(inputCol=x, outputCol=x+'_tmp')
            for x in categoricals ]
 
encoders = [OneHotEncoder(dropLast=False, inputCol=x+"_tmp", outputCol=y) 
            for x,y in zip(categoricals, categorical_dummies)]

tmp = [[i,j] for i,j in zip(indexers, encoders)]
tmp = [i for sublist in tmp for i in sublist]

In [None]:
tmp

In [None]:
# prepare labeled sets
final_cols = [col[0] for col in df_final.dtypes if col[1] != 'string'].extend(categorical_dummies)

print(non_categoricals)

In [None]:
assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
labelIndexer = StringIndexer(inputCol='loan_status', outputCol="label")
tmp += [assembler_features, labelIndexer]
pipeline = Pipeline(stages=tmp)

In [None]:
pipeline.fit(df_final)

In [None]:
pipeline

In [None]:
allData = pipeline.fit(df_final).transform(df_final)
allData.cache()
trainingData, testData = allData.randomSplit([TRAINING_DATA_RATIO, 1-TRAINING_DATA_RATIO], seed=RANDOM_SEED) # need to ensure same split for each time
print("Distribution of loan status in trainingData is: ", trainingData.groupBy("label").count().take(3))

In [None]:
df_updated.select("loan_amnt", "int_rate", "loan_status").show(10)

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="loan_status", outputCol="indexed_loan_status").fit(df_updated)

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import functools
%matplotlib inline
 
statuses = df_updated.groupBy('loan_status').count().collect()
categories = [i[0] for i in statuses]
counts = [i[1] for i in statuses]
 
ind = np.array(range(len(categories)))
width = 0.35
plt.bar(ind, counts, width=width, color='r')
 
plt.ylabel('counts')
plt.title('Status distribution')
plt.xticks(ind + width/2., categories)