In [1]:
import findspark

# Called before the pyspark imports below -- presumably so findspark can
# locate the local Spark installation and make `pyspark` importable.
# Keep this ordering (TODO confirm it is still needed in this environment).
findspark.init()
 
from pyspark import SparkConf, SparkContext 
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext 
from pyspark.sql.functions import udf

In [2]:
# Local Spark setup: master 'local[8]', application name 'vernacularName'.
config = SparkConf().setMaster('local[8]').setAppName('vernacularName')
# getOrCreate() reuses an already-running context, so re-running this cell in a
# live kernel no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=config)
sqlCtx = SQLContext(sc)

In [9]:
# Load the Wikipedia summaries CSV (header row, inferred column types) and
# expose it to Spark SQL under the name "wikipedia_summary".
# The path uses forward slashes only: the previous '../\extract_...' form
# contained a stray backslash, which is a path separator on Windows but a
# literal character in the directory name everywhere else.
# NOTE(review): `_c0` is inferred as string and the pandas index shown further
# down outruns `_c0`, which suggests some records are split on embedded
# newlines. Consider multiLine=True (and escape='"') -- confirm against the file.
DF = sqlCtx.read.csv('../extract_wikipedia_summaries/summaries_1.csv', header=True, inferSchema=True, sep=',')
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe
# to call again when the cell is re-run.
DF.createOrReplaceTempView("wikipedia_summary")
DF.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- name: string (nullable = true)
 |-- summary: string (nullable = true)



In [10]:
# Preview five rows whose summary is present.
(
    sqlCtx
    .sql("SELECT * FROM wikipedia_summary WHERE summary is not NULL LIMIT 5")
    .show()
)

+---+-------------------+--------------------+
|_c0|               name|             summary|
+---+-------------------+--------------------+
|  0|  Cintractiellaceae|The Cintractiella...|
|  1|       Wallemiaceae|The Wallemiomycet...|
|  2|    Kineosporiaceae|The Kineosporiace...|
|  6|Geodermatophilaceae|The Geodermatophi...|
|  7|       Tremellaceae|The Tremellaceae ...|
+---+-------------------+--------------------+



In [5]:
# Part-of-speech tagging
# spaCy documentation: https://spacy.io/usage/linguistic-features
# spaCy dependency labels: https://spacy.io/api/annotation#dependency-parsing
import spacy
nlp = spacy.load("en_core_web_sm")

@udf(returnType=StringType())
def find_vernacular_name(summary:str):
    '''Return the lemma of the first object of a preposition in ``summary``.

    Heuristic: assumes the first object of a preposition ("pobj") in a
    Wikipedia summary is the vernacular name.

    Args:
        summary: Summary text for one row; may be None for rows without one.

    Returns:
        The lemma of the first token whose dependency label is 'pobj', or
        None when ``summary`` is not a string or contains no such token.
    '''
    # Rows without a summary arrive as None; answer with None explicitly
    # instead of relying on the parser raising a particular exception type.
    if not isinstance(summary, str):
        return None
    for token in nlp(summary):
        if token.dep_ == 'pobj':  # pobj: object of preposition
            return token.lemma_
    # No object of a preposition found (this also covers an empty summary).
    # Previously this fell through and returned the lemma of the last token,
    # or failed on an unbound `token` when the document had no tokens at all.
    return None

In [6]:
# Add the UDF result, computed from the 'summary' column, as 'vernacular_name'.
DF = DF.withColumn(
    'vernacular_name',
    find_vernacular_name('summary'),
)

In [7]:
# Collect the Spark DataFrame into a pandas DataFrame on the driver; this is
# the step that actually runs the UDF over the rows.
# Note: `df` (pandas) and `DF` (Spark) differ only by letter case.
df = DF.toPandas()

In [8]:
# Keep only the rows for which a vernacular name was extracted.
df[df['vernacular_name'].notnull()]

Unnamed: 0,_c0,name,summary,vernacular_name
0,0,Cintractiellaceae,The Cintractiellaceae are a family of smut fun...,fungus
1,1,Wallemiaceae,The Wallemiomycetes are a class of fungi in th...,fungus
2,2,Kineosporiaceae,The Kineosporiaceae comprise a polyphyletic Ac...,order
6,6,Geodermatophilaceae,The Geodermatophilaceae are an actinomycete fa...,suborder
7,7,Tremellaceae,The Tremellaceae are a family of fungi in the ...,fungus
...,...,...,...,...
123641,100750,Odontopsammodius,Odontopsammodius is a genus of aphodiine dung ...,beetle
123642,100751,Aidophus,Aidophus is a genus of aphodiinae dung beetles...,beetle
123655,100764,Odontopsammodius cruentus,Odontopsammodius cruentus is a species of apho...,beetle
123845,100954,Dialytes,Dialytes is a genus of aphodiine dung beetles ...,beetle


In [9]:
# df.to_csv('vernacular_name.csv',index=False)