1. create a spark dataframe without using a csv reader 
2. Make sure each column is the correct type
3. handle NA values

In [19]:
from pyspark.sql import SQLContext, Row,DataFrame
from pyspark.sql.types import *
filename = 'airquality.csv'
rdd = sc.textFile(filename)

# rdd.map(lambda x: if x=='NA')
# rdd.collect()[0].split(',')

rdd2=rdd.map(lambda x: x.split(','))
header = rdd2.first()
rdd_noHeader=rdd2.filter(lambda x: x != header)


#infer type based on what is there
def infer_type(x):
    if '"' in x:
        x = x.replace('"','')
        
    if '.' in x:
        return float(x)
    elif x =='NA':
        return None
    else:
        return int(x) 
        
cleaned = rdd_noHeader.map(lambda x: [infer_type(a) for a in x])
header_stripped = [x.replace('"','') for x in header]

In [38]:
# converting to DatFrame w/o Schema
df = cleaned.toDF(header_stripped)
df.show()
print df

+---+-----+-------+----+----+-----+---+
|   |Ozone|Solar.R|Wind|Temp|Month|Day|
+---+-----+-------+----+----+-----+---+
|  1|   41|    190| 7.4|  67|    5|  1|
|  2|   36|    118|null|  72|    5|  2|
|  3|   12|    149|12.6|  74|    5|  3|
|  4|   18|    313|11.5|  62|    5|  4|
|  5| null|   null|14.3|  56|    5|  5|
|  6|   28|   null|14.9|  66|    5|  6|
|  7|   23|    299| 8.6|  65|    5|  7|
|  8|   19|     99|13.8|  59|    5|  8|
|  9|    8|     19|20.1|  61|    5|  9|
| 10| null|    194| 8.6|  69|    5| 10|
| 11|    7|   null| 6.9|  74|    5| 11|
| 12|   16|    256| 9.7|  69|    5| 12|
| 13|   11|    290| 9.2|  66|    5| 13|
| 14|   14|    274|10.9|  68|    5| 14|
| 15|   18|     65|13.2|  58|    5| 15|
| 16|   14|    334|11.5|  64|    5| 16|
| 17|   34|    307|null|  66|    5| 17|
| 18|    6|     78|18.4|  57|    5| 18|
| 19|   30|    322|11.5|  68|    5| 19|
| 20|   11|     44| 9.7|  62|    5| 20|
+---+-----+-------+----+----+-----+---+
only showing top 20 rows

DataFrame[: bi

In [46]:
# alternatively, we could impose a schema
schema_ = StructType([StructField('index',IntegerType(),True),\
           StructField(header_stripped[1],IntegerType(),True),\
           StructField(header_stripped[2],IntegerType(),True),\
           StructField(header_stripped[3],DoubleType(),True),\
           StructField(header_stripped[4],IntegerType(),True),\
           StructField(header_stripped[5],IntegerType(),True),\
           StructField(header_stripped[6],IntegerType(),True)])

# shorthand:
df = cleaned.toDF(schema=schema_)

# full:
df = sqlContext.createDataFrame(cleaned,schema_)
df.show()
print df.printSchema()
# df.collect()

+-----+-----+-------+----+----+-----+---+
|index|Ozone|Solar.R|Wind|Temp|Month|Day|
+-----+-----+-------+----+----+-----+---+
|    1|   41|    190| 7.4|  67|    5|  1|
|    2|   36|    118|null|  72|    5|  2|
|    3|   12|    149|12.6|  74|    5|  3|
|    4|   18|    313|11.5|  62|    5|  4|
|    5| null|   null|14.3|  56|    5|  5|
|    6|   28|   null|14.9|  66|    5|  6|
|    7|   23|    299| 8.6|  65|    5|  7|
|    8|   19|     99|13.8|  59|    5|  8|
|    9|    8|     19|20.1|  61|    5|  9|
|   10| null|    194| 8.6|  69|    5| 10|
|   11|    7|   null| 6.9|  74|    5| 11|
|   12|   16|    256| 9.7|  69|    5| 12|
|   13|   11|    290| 9.2|  66|    5| 13|
|   14|   14|    274|10.9|  68|    5| 14|
|   15|   18|     65|13.2|  58|    5| 15|
|   16|   14|    334|11.5|  64|    5| 16|
|   17|   34|    307|null|  66|    5| 17|
|   18|    6|     78|18.4|  57|    5| 18|
|   19|   30|    322|11.5|  68|    5| 19|
|   20|   11|     44| 9.7|  62|    5| 20|
+-----+-----+-------+----+----+---

In [101]:
# using Row() 
# cleaned = rdd_noHeader.map(lambda x: [infer_type(a) for a in x])
# rdd_noHeader.map(lambda x: [Row(header_stripped[i]=infer_type(a)) for i,a in enumerate(x)])
# convert the NAs first
rdd_partialClean = rdd_noHeader.map(lambda x: [None if a=='NA' else a for a in x])
# rdd_partialClean.map(lambda x: [Row(=infer_type(a)) for i,a in enumerate(x)])

# def convert_types(x):
#     typesL = [int,int,float,int,int,int]
#     for a in x:
#         if a != None:
            
    
# rdd_cleaned=rdd_partialClean.map(lambda x: Row(index=str(x[0]),\
#                                    Ozone=int(x[1]),\
#                                   Solar=int(x[2]),\
#                                   Wind = float(x[3]),\
#                                   Temp=int(x[4]),\
#                                   Month=int(x[5]),\
#                                   Day=int(x[6])))


# sqlContext.createDataFrame(rdd_cleaned).show()
rdd_partialClean.map(lambda x: Row(index=str(x[0]),Ozone=int(x[1]))).collect()
# rdd_partialClean.map(lambda x: x[0]).collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 78.0 failed 1 times, most recent failure: Lost task 0.0 in stage 78.0 (TID 290, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-101-ed6e3c78205a>", line 11, in <lambda>
TypeError: int() argument must be a string or a number, not 'NoneType'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:129)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:125)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/jo/spark/spark-1.6.1-bin-hadoop2.4/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-101-ed6e3c78205a>", line 11, in <lambda>
TypeError: int() argument must be a string or a number, not 'NoneType'

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:129)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:125)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	... 1 more


In [103]:
None == 3

False

In [62]:
# register the table 
df.registerTempTable('mytable')

# query 
sqlContext.sql('select * from mytable where Month = 6').show(30)

+-----+-----+-------+----+----+-----+---+
|index|Ozone|Solar.R|Wind|Temp|Month|Day|
+-----+-----+-------+----+----+-----+---+
|   32| null|    286| 8.6|  78|    6|  1|
|   33| null|    287| 9.7|  74|    6|  2|
|   34| null|    242|16.1|  67|    6|  3|
|   35| null|    186| 9.2|  84|    6|  4|
|   36| null|    220| 8.6|  85|    6|  5|
|   37| null|    264|14.3|  79|    6|  6|
|   38|   29|    127| 9.7|  82|    6|  7|
|   39| null|    273| 6.9|  87|    6|  8|
|   40|   71|    291|13.8|  90|    6|  9|
|   41|   39|    323|11.5|  87|    6| 10|
|   42| null|    259|10.9|  93|    6| 11|
|   43| null|    250| 9.2|  92|    6| 12|
|   44|   23|    148|null|  82|    6| 13|
|   45| null|    332|13.8|  80|    6| 14|
|   46| null|    322|11.5|  79|    6| 15|
|   47|   21|    191|14.9|  77|    6| 16|
|   48|   37|    284|20.7|  72|    6| 17|
|   49|   20|     37| 9.2|  65|    6| 18|
|   50|   12|    120|11.5|  73|    6| 19|
|   51|   13|    137|10.3|  76|    6| 20|
|   52| null|    150| 6.3|  77|   