Creating SparkSession

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Income Prediction").config("spark.memory.offHeap.enabled", "true").config("spark.memory.offHeap.size", "10g").getOrCreate()

Reading my data and putting it into a dataframe

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [9]:
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", IntegerType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", IntegerType(), True),
    StructField("marital_status", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capital_gain", IntegerType(), True),
    StructField("capital_loss", IntegerType(), True),
    StructField("hours_per_week", IntegerType(), True),
    StructField("native_country", StringType(), True),
    StructField("income", StringType(), True)
])

df1 = spark.read.csv("adult test.csv", header=True, schema=schema, sep=",", encoding="UTF-8")

In [10]:
df1.show(5, 0)

+----+---------+------+---------+-------------+--------------+----------+------------+----+----+------------+------------+--------------+--------------+------+
|age |workclass|fnlwgt|education|education_num|marital_status|occupation|relationship|race|sex |capital_gain|capital_loss|hours_per_week|native_country|income|
+----+---------+------+---------+-------------+--------------+----------+------------+----+----+------------+------------+--------------+--------------+------+
|NULL|NULL     |NULL  |NULL     |NULL         |NULL          |NULL      |NULL        |NULL|NULL|NULL        |NULL        |NULL          |NULL          |NULL  |
|NULL|NULL     |NULL  |NULL     |NULL         |NULL          |NULL      |NULL        |NULL|NULL|NULL        |NULL        |NULL          |NULL          |NULL  |
|NULL|NULL     |NULL  |NULL     |NULL         |NULL          |NULL      |NULL        |NULL|NULL|NULL        |NULL        |NULL          |NULL          |NULL  |
|NULL|NULL     |NULL  |NULL     |NULL   

In [11]:
raw_data = spark.read.text("adult test.csv")
raw_data.show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
||1x3 Cross validator                                                                                                                        |
|"25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K."                   |
|"38, Private, 89814, HS-grad, 9, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 50, United-States, <=50K."                |
|"28, Local-gov, 336951, Assoc-acdm, 12, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K."          |

In [12]:
df = spark.read.csv(
    "adult test.csv",
    header=True,
    inferSchema=True,
    sep=",",
    quote="\"",
    escape="\""
)
df.show(5, truncate=False)
df.printSchema()

+-------------------------------------------------------------------------------------------------------------------------------------+
||1x3 Cross validator                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------+
|25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K.              |
|38, Private, 89814, HS-grad, 9, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 50, United-States, <=50K.           |
|28, Local-gov, 336951, Assoc-acdm, 12, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K.     |
|44, Private, 160323, Some-college, 10, Married-civ-spouse, Machine-op-inspct, Husband, Black, Male, 7688, 0, 40, United-States, >50K.|
|18, ?, 103497, Some-college, 10, Never-married,

In [16]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", IntegerType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", IntegerType(), True),
    StructField("marital_status", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capital_gain", IntegerType(), True),
    StructField("capital_loss", IntegerType(), True),
    StructField("hours_per_week", IntegerType(), True),
    StructField("native_country", StringType(), True),
    StructField("income", StringType(), True)
])

df = spark.read.csv(
    "adult test.csv",
    header=True,
    #schema=schema,
    sep=",",
    quote="\"",
    escape="\""
)
pandas_df = df.toPandas()
pandas_df

Unnamed: 0,|1x3 Cross validator
0,"25, Private, 226802, 11th, 7, Never-married, M..."
1,"38, Private, 89814, HS-grad, 9, Married-civ-sp..."
2,"28, Local-gov, 336951, Assoc-acdm, 12, Married..."
3,"44, Private, 160323, Some-college, 10, Married..."
4,"18, ?, 103497, Some-college, 10, Never-married..."
...,...
16276,"39, Private, 215419, Bachelors, 13, Divorced, ..."
16277,"64, ?, 321403, HS-grad, 9, Widowed, ?, Other-r..."
16278,"38, Private, 374983, Bachelors, 13, Married-ci..."
16279,"44, Private, 83891, Bachelors, 13, Divorced, A..."
