# Ex3 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [2]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise create it under
# the application name 'MyApp'.
spark = (
    SparkSession.builder
    .appName('MyApp')
    .getOrCreate()
)
# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users (Spark DataFrames have no index, so 'user_id' is kept as an ordinary column)

In [5]:
from pyspark import SparkFiles

# Remote location of the pipe-delimited user file.
path = r'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'

# Have Spark fetch the file; SparkFiles.get() then resolves the local copy
# by its file name.
spark.sparkContext.addFile(path)

# Read the local copy as CSV: '|' separator, first line is the header, and
# column types are inferred from the data.
users = (
    spark.read
    .options(sep='|', header='true', inferSchema='true')
    .csv('file:///' + SparkFiles.get('u.user'))
)

### Step 4. See the first 25 entries

In [6]:
# Print the first 25 rows as a text table.
users.show(n=25)

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

### Step 5. See the last 10 entries

In [7]:
# Return the last 10 rows to the driver as a list of Row objects.
users.tail(num=10)

[Row(user_id=934, age=61, gender='M', occupation='engineer', zip_code='22902'),
 Row(user_id=935, age=42, gender='M', occupation='doctor', zip_code='66221'),
 Row(user_id=936, age=24, gender='M', occupation='other', zip_code='32789'),
 Row(user_id=937, age=48, gender='M', occupation='educator', zip_code='98072'),
 Row(user_id=938, age=38, gender='F', occupation='technician', zip_code='55038'),
 Row(user_id=939, age=26, gender='F', occupation='student', zip_code='33319'),
 Row(user_id=940, age=32, gender='M', occupation='administrator', zip_code='02215'),
 Row(user_id=941, age=20, gender='M', occupation='student', zip_code='97229'),
 Row(user_id=942, age=48, gender='F', occupation='librarian', zip_code='78209'),
 Row(user_id=943, age=22, gender='M', occupation='student', zip_code='77841')]

### Step 6. What is the number of observations in the dataset?

In [8]:
# Number of rows (observations) in the DataFrame.
users.count()

943

### Step 7. What is the number of columns in the dataset?

In [11]:
# `columns` is a plain Python list of column names, so its length is the
# number of columns.
len(users.columns)

5

### Step 8. Print the name of all the columns.

In [12]:
# Column names, in the order they appear in the file header.
users.columns

['user_id', 'age', 'gender', 'occupation', 'zip_code']

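### Step 9. How is the dataset indexed?

Spark DataFrames have no row index; unlike pandas, rows are identified only through ordinary columns such as `user_id`.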

### Step 10. What is the data type of each column?

In [13]:
# List of (column name, Spark type name) pairs produced by schema inference.
# zip_code comes back as a string, presumably because some codes are not
# purely numeric (e.g. 'Y1A6B').
users.dtypes

[('user_id', 'int'),
 ('age', 'int'),
 ('gender', 'string'),
 ('occupation', 'string'),
 ('zip_code', 'string')]

### Step 11. Print only the occupation column

In [15]:
# Project just the occupation column and print its first 10 values.
users.select('occupation').show(n=10)

+-------------+
|   occupation|
+-------------+
|   technician|
|        other|
|       writer|
|   technician|
|        other|
|    executive|
|administrator|
|administrator|
|      student|
|       lawyer|
+-------------+
only showing top 10 rows



### Step 12. How many different occupations are in this dataset?

In [16]:
# Count the unique values in the occupation column.
users.select('occupation').distinct().count()

21

### Step 13. What is the most frequent occupation?

In [17]:
from pyspark.sql.functions import col

In [18]:
# Tally users per occupation and list the tallies from largest to smallest;
# the most frequent occupation is the first row.
(
    users.select('occupation')
    .groupBy('occupation')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-------------+-----+
|   occupation|count|
+-------------+-----+
|      student|  196|
|        other|  105|
|     educator|   95|
|administrator|   79|
|     engineer|   67|
|   programmer|   66|
|    librarian|   51|
|       writer|   45|
|    executive|   32|
|    scientist|   31|
|       artist|   28|
|   technician|   27|
|    marketing|   26|
|entertainment|   18|
|   healthcare|   16|
|      retired|   14|
|       lawyer|   12|
|     salesman|   12|
|         none|    9|
|    homemaker|    7|
+-------------+-----+
only showing top 20 rows



### Step 14. Summarize the DataFrame.

In [22]:
# describe() reports count, mean, stddev, min and max for every column;
# string columns such as gender and occupation show null for mean/stddev.
users.describe().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  null|         null| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  null|         null|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+-------------+------------------+



### Step 15. Summarize all the columns

In [23]:
# summary() with no arguments reports the same statistics as describe()
# plus the 25%, 50% and 75% percentiles.
users.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  null|         null| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  null|         null|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  null|         null|           21227.0|
|    50%|              472|               31|  null|         null|           53711.0|
|    75%|              708|               43|  null|         null|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+-------------+------------------+

### Step 16. Summarize only the occupation column

In [25]:
# Restrict the summary statistics to the occupation column.
users.select('occupation').summary().show()

+-------+-------------+
|summary|   occupation|
+-------+-------------+
|  count|          943|
|   mean|         null|
| stddev|         null|
|    min|administrator|
|    25%|         null|
|    50%|         null|
|    75%|         null|
|    max|       writer|
+-------+-------------+



### Step 17. What is the mean age of users?

In [27]:
from pyspark.sql.functions import avg

In [28]:
# Average of the age column, shown as a one-row table.
users.select('age').select(avg('age')).show()

+-----------------+
|         avg(age)|
+-----------------+
|34.05196182396607|
+-----------------+



### Step 18. What is the age with least occurrence?

In [33]:
# Tally users per age and list the tallies from smallest to largest; the
# least frequent ages are the first rows (several ages tie at one user).
(
    users.select('age')
    .groupBy('age')
    .count()
    .orderBy('count')
    .show()
)

+---+-----+
|age|count|
+---+-----+
|  7|    1|
| 10|    1|
| 73|    1|
| 11|    1|
| 66|    1|
| 64|    2|
| 69|    2|
| 62|    2|
| 68|    2|
| 65|    3|
| 61|    3|
| 59|    3|
| 63|    3|
| 70|    3|
| 58|    3|
| 14|    3|
| 54|    4|
| 13|    5|
| 16|    5|
| 52|    6|
+---+-----+
only showing top 20 rows

