In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [3]:
 X.head()   #here u will get the datset in the from of rows and columns means in pandas table

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [4]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=74d177f5cac9b893e508ab30f4a9a603d4e8a2d51fe3a6745d56f9ae1c1591f0
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [5]:
import pyspark
from pyspark.sql import SparkSession

In [7]:
# Create a Spark session
spark = SparkSession.builder.appName("income_dataset").getOrCreate()

In [8]:
data=spark.read.csv('/content/adult.data')
data.show()

+---+-----------------+-------+-------------+---+--------------------+------------------+--------------+-------------------+-------+------+----+----+--------------+------+
|_c0|              _c1|    _c2|          _c3|_c4|                 _c5|               _c6|           _c7|                _c8|    _c9|  _c10|_c11|_c12|          _c13|  _c14|
+---+-----------------+-------+-------------+---+--------------------+------------------+--------------+-------------------+-------+------+----+----+--------------+------+
| 39|        State-gov|  77516|    Bachelors| 13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|  2174|   0|  40| United-States| <=50K|
| 50| Self-emp-not-inc|  83311|    Bachelors| 13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|     0|   0|  13| United-States| <=50K|
| 38|          Private| 215646|      HS-grad|  9|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|     0| 

In [9]:
#adding label name to the columns
labels=['age','workclass','fnlwgt','education','numbers','marital','occupation','relation','race','gender','gain','loss','hourlypay','country','income']

In [10]:
df=data.toDF(*labels)
df.show(5)

+---+-----------------+-------+----------+-------+-------------------+------------------+--------------+------+-------+-----+----+---------+--------------+------+
|age|        workclass| fnlwgt| education|numbers|            marital|        occupation|      relation|  race| gender| gain|loss|hourlypay|       country|income|
+---+-----------------+-------+----------+-------+-------------------+------------------+--------------+------+-------+-----+----+---------+--------------+------+
| 39|        State-gov|  77516| Bachelors|     13|      Never-married|      Adm-clerical| Not-in-family| White|   Male| 2174|   0|       40| United-States| <=50K|
| 50| Self-emp-not-inc|  83311| Bachelors|     13| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|    0|   0|       13| United-States| <=50K|
| 38|          Private| 215646|   HS-grad|      9|           Divorced| Handlers-cleaners| Not-in-family| White|   Male|    0|   0|       40| United-States| <=50K|
| 53|          Private

In [11]:
df.printSchema()

root
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- numbers: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relation: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gain: string (nullable = true)
 |-- loss: string (nullable = true)
 |-- hourlypay: string (nullable = true)
 |-- country: string (nullable = true)
 |-- income: string (nullable = true)



In [13]:
from pyspark.sql.functions import col
new_df=df.withColumn('age',col('age').cast('integer'))

In [14]:
new_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- numbers: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relation: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gain: string (nullable = true)
 |-- loss: string (nullable = true)
 |-- hourlypay: string (nullable = true)
 |-- country: string (nullable = true)
 |-- income: string (nullable = true)



In [23]:
for i in ['fnlwgt','numbers','gain','loss','hourlypay']:
  new_df=new_df.withColumn(i,col(i).cast('integer'))

In [24]:
from pyspark.sql.functions import *
new_df.select([count(when(col(c).isNull(),c )).alias(c) for c in new_df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|        0|     0|        0|      0|      0|         0|       0|   0|     0|   0|   0|        0|      0|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



In [25]:
df.select('workclass').distinct().show()  #null values are present

+-----------------+
|        workclass|
+-----------------+
|        State-gov|
|      Federal-gov|
| Self-emp-not-inc|
|        Local-gov|
|          Private|
|                ?|
|     Self-emp-inc|
|      Without-pay|
|     Never-worked|
+-----------------+



In [26]:
df.groupby('workclass').count().show()

+-----------------+-----+
|        workclass|count|
+-----------------+-----+
|        State-gov| 1298|
|      Federal-gov|  960|
| Self-emp-not-inc| 2541|
|        Local-gov| 2093|
|          Private|22696|
|                ?| 1836|
|     Self-emp-inc| 1116|
|      Without-pay|   14|
|     Never-worked|    7|
+-----------------+-----+



In [32]:
#replacing the null values
df=new_df.replace(' ?',None)

In [33]:
from pyspark.sql.functions import *
df.select([count(when(col(c).isNull(),c )).alias(c) for c in df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|     1836|     0|        0|      0|      0|      1843|       0|   0|     0|   0|   0|        0|    583|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



In [35]:
df.groupby('country').count().show()

+-------------------+-----+
|            country|count|
+-------------------+-----+
| Dominican-Republic|   70|
|            Ireland|   24|
|               Cuba|   95|
|          Guatemala|   64|
|               Iran|   43|
|             Taiwan|   51|
|        El-Salvador|  106|
|      United-States|29170|
|              South|   80|
|              Japan|   62|
|          Nicaragua|   34|
|               NULL|  583|
|             Canada|  121|
|           Cambodia|   19|
|               Laos|   18|
|            Germany|  137|
|    Trinadad&Tobago|   19|
|               Peru|   31|
|            Ecuador|   28|
|         Yugoslavia|   16|
+-------------------+-----+
only showing top 20 rows



In [36]:
#using the mode method and filling the null values
df=df.fillna('United-States',subset=['country'])

In [37]:
df.groupby('workclass').count().show()

+-----------------+-----+
|        workclass|count|
+-----------------+-----+
|        State-gov| 1298|
|      Federal-gov|  960|
|             NULL| 1836|
| Self-emp-not-inc| 2541|
|        Local-gov| 2093|
|          Private|22696|
|     Self-emp-inc| 1116|
|      Without-pay|   14|
|     Never-worked|    7|
+-----------------+-----+



In [38]:
#using the mode method and filling the null values
df=df.fillna('Private',subset=['workclass'])

In [39]:
df.groupby('occupation').count().show()

+------------------+-----+
|        occupation|count|
+------------------+-----+
|   Farming-fishing|  994|
|              NULL| 1843|
| Handlers-cleaners| 1370|
|    Prof-specialty| 4140|
|      Adm-clerical| 3770|
|   Exec-managerial| 4066|
|      Craft-repair| 4099|
|             Sales| 3650|
|      Tech-support|  928|
|  Transport-moving| 1597|
|   Protective-serv|  649|
|      Armed-Forces|    9|
| Machine-op-inspct| 2002|
|     Other-service| 3295|
|   Priv-house-serv|  149|
+------------------+-----+



In [40]:
df=df.fillna('Prof-specialty',subset=['occupation'])

In [41]:
new_df.select([count(when(col(c).isNull(),c )).alias(c) for c in new_df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|        0|     0|        0|      0|      0|         0|       0|   0|     0|   0|   0|        0|      0|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



#Applying Logistic Regression

In [42]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler,StringIndexer


In [43]:
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'numbers',
 'marital',
 'occupation',
 'relation',
 'race',
 'gender',
 'gain',
 'loss',
 'hourlypay',
 'country',
 'income']

In [44]:
categorical_cols=['age',
 'education',
 'marital',
 'occupation',
 'relation',
 'race',
 'gender',
 'country']

numerical_cols=['age','fnlwgt','numbers','gain','loss','hourlypay']
label='income'

In [45]:
#applying indexer
indexer=[StringIndexer(inputCol=c,outputCol=f'{c}_index',handleInvalid='keep') for c in categorical_cols]

In [46]:
label_indexer=StringIndexer(inputCol='income',outputCol='label',handleInvalid='keep')

In [47]:
assembler=VectorAssembler(inputCols=[f'{c}_index' for c in categorical_cols]+numerical_cols,outputCol='features')

In [48]:
#using logistic regression
lr=LogisticRegression(featuresCol='features',labelCol='label')

In [49]:
pipeline = Pipeline(stages=indexer+[assembler,label_indexer,lr])

In [50]:
train_data,test_data=df.randomSplit([.8,.2])

In [51]:
model=pipeline.fit(train_data)

In [52]:
prediction=model.transform(test_data)

In [53]:
prediction.show()

+---+----------+------+---------+-------+--------------+------------------+---------------+------+-------+----+----+---------+--------------+------+---------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+-----+--------------------+--------------------+----------+
|age| workclass|fnlwgt|education|numbers|       marital|        occupation|       relation|  race| gender|gain|loss|hourlypay|       country|income|age_index|education_index|marital_index|occupation_index|relation_index|race_index|gender_index|country_index|            features|label|       rawPrediction|         probability|prediction|
+---+----------+------+---------+-------+--------------+------------------+---------------+------+-------+----+----+---------+--------------+------+---------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+-----+--------------------+----------------

In [54]:
prediction.select('label','prediction').show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [55]:
prediction.select('label','prediction').distinct().show()

+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+



In [56]:
prediction.groupby('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  806|
|  0.0|       1.0|  344|
|  1.0|       0.0|  739|
|  0.0|       0.0| 4685|
+-----+----------+-----+



In [57]:
 #evaluating the data
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='label',metricName='accuracy')

In [58]:
evaluator.evaluate(prediction)


0.8352601156069365

The accuracy was found to be 84% on the testing data