Creating a PySpark DataFrame from a list of rows.
Each `Row` supplies a value for every column, and the list of rows is passed to `createDataFrame` to build the DataFrame.

In [None]:
!pip install pyspark py4j

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=0fd5331a4bed469f64dc81083d5f09fcd549cd935090337d75c1820cbf1f9887
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
sp = SparkSession.builder.getOrCreate()

# Every Row names its own columns; no schema is passed, so Spark infers the types.
rows = [
    Row(a=1, b=4.0, c='A1', d=date(2000, 1, 3), e=datetime(2000, 8, 1, 12, 0)),
    Row(a=2, b=6.0, c='B1', d=date(2000, 6, 8), e=datetime(2000, 6, 1, 12, 0)),
]
df = sp.createDataFrame(rows)

# Print the rows as a table, then the inferred schema.
df.show()
df.printSchema()

+---+---+---+----------+-------------------+
|  a|  b|  c|         d|                  e|
+---+---+---+----------+-------------------+
|  1|4.0| A1|2000-01-03|2000-08-01 12:00:00|
|  2|6.0| B1|2000-06-08|2000-06-01 12:00:00|
+---+---+---+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



Create PySpark DataFrame with an explicit schema

In [None]:
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession

sp = SparkSession.builder.getOrCreate()

# Column names and types are declared up front instead of being inferred.
ddl_schema = 'a integer, b double, c string, d date, e timestamp'

# Plain tuples: values are matched to the schema's columns by position.
records = [
    (1, 4.0, 'A1', date(2000, 8, 1), datetime(2000, 8, 1, 12, 0)),
    (2, 6.0, 'B1', date(2000, 6, 2), datetime(2000, 6, 2, 12, 0)),
]
df = sp.createDataFrame(records, schema=ddl_schema)

# Print the rows as a table, then the declared schema.
df.show()
df.printSchema()

+---+---+---+----------+-------------------+
|  a|  b|  c|         d|                  e|
+---+---+---+----------+-------------------+
|  1|4.0| A1|2000-08-01|2000-08-01 12:00:00|
|  2|6.0| B1|2000-06-02|2000-06-02 12:00:00|
+---+---+---+----------+-------------------+

root
 |-- a: integer (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



Create PySpark DataFrame from CSV

In [None]:
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

# pandas reads the CSV into memory first; Spark then converts the pandas frame.
csv_pdf = pd.read_csv('/content/train_dataset-1.csv')
df = spark.createDataFrame(csv_pdf)

# Print the rows as a table, then the schema.
df.show()
df.printSchema()

+------+---+--------+-----------+-----------------+-------------+------------+-------------------------+
|Gender|Age|openness|neuroticism|conscientiousness|agreeableness|extraversion|Personality (Class label)|
+------+---+--------+-----------+-----------------+-------------+------------+-------------------------+
|  Male| 17|       7|          4|                7|            3|           2|              extraverted|
|  Male| 19|       4|          5|                4|            6|           6|                  serious|
|Female| 18|       7|          6|                4|            5|           5|               dependable|
|Female| 22|       5|          6|                7|            4|           3|              extraverted|
|Female| 19|       7|          4|                6|            5|           4|                   lively|
|  Male| 18|       5|          7|                7|            6|           4|                   lively|
|Female| 17|       5|          6|                5|    

Creating a PySpark DataFrame from a text file

In [None]:
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession

sp = SparkSession.builder.getOrCreate()

# The file is tab-separated, so the delimiter has to be given explicitly.
# With read_csv's default comma separator each line is read as one single column.
tsv_pdf = pd.read_csv('/content/tab_separated_values.txt', sep='\t')

# Use the session created in this cell, not a name left over from another cell,
# so the cell also runs on a fresh kernel.
df = sp.createDataFrame(tsv_pdf)

# Print the rows as a table, then the schema.
df.show()
df.printSchema()

+------------------------------+
|Value1\tValue2\tValue3\tValue4|
+------------------------------+
|                    1\t2\t3\t4|
|                11\t22\t33\t44|
|            111\t222\t333\t444|
+------------------------------+

root
 |-- Value1\tValue2\tValue3\tValue4: string (nullable = true)



Creating a PySpark DataFrame from a JSON file.

In [None]:
import json
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession

sparkss = SparkSession.builder.getOrCreate()

# Parse the file with the json module so the shape of the document can be inspected
# before a pandas frame is built from it.
with open('/content/values.json', encoding='utf-8') as json_file:
    payload = json.load(json_file)

# A JSON object whose values are all scalars is one record, not a table of columns.
# pandas rejects that shape ("If using all scalar values, you must pass an index"),
# so wrap it in a list to get a single-row frame.
# NOTE(review): the file's contents are not visible here; this assumes the flat-object
# shape that the recorded ValueError points to. Other shapes are passed through as-is.
if isinstance(payload, dict) and not any(
    isinstance(value, (list, dict)) for value in payload.values()
):
    payload = [payload]

df = sparkss.createDataFrame(pd.DataFrame(payload))

# Print the rows as a table, then the schema.
df.show()
df.printSchema()

ValueError: If using all scalar values, you must pass an index