# Scraping

In [1]:
import pandas as pd
import re
import json

In [2]:

dict1 = pd.read_csv('electricity1.csv', delimiter='|')
dict1['definition'] = dict1['definition'].str.lower()
dict1['term'] = dict1['term'].str.lower()
dict1['term'] = dict1['term'].apply(lambda x: re.sub(r" ?\([^)]+\)", "", x))
dict1

Unnamed: 0,term,definition
0,alternating current,an electric current that reverses its directio...
1,ammeter,an instrument for measuring the flow of electr...
2,ampacity,the maximum amount of electric current a condu...
3,ampere-hour,a unit of measure for battery capacity. it is ...
4,ampere,a unit of measure for the intensity of an elec...
...,...,...
75,voltmeter,an instrument for measuring the force in volts...
76,watt-hour,a unit of electrical energy equivalent to a po...
77,watt,a unit of electrical power. one watt is equiva...
78,wattmeter,the wattmeter is an instrument for measuring t...


In [3]:

dict2 = pd.read_csv('electricity2.csv', delimiter='|')
dict2['definition'] = dict2['definition'].str.lower()
dict2['term'] = dict2['term'].str.lower()
dict2['term'] = dict2['term'].apply(lambda x: re.sub(r" ?\([^)]+\)", "", x))
dict2

Unnamed: 0,term,definition
0,actuator solenoid,the solenoid in the actuator housing on the ba...
1,alternator,a device which converts mechanical energy into...
2,alternating current,a flow of electrons which reverses its directi...
3,ambient temperature,"the temperature of the surrounding medium, suc..."
4,ammeter,an instrument for measuring the flow of electr...
...,...,...
167,wave,a signal that is produced by varying a continu...
168,waveform,a graphical representation ofelectrical cycles...
169,winding,the coiling of a wire about itself or about so...
170,wiring harness,thf3 trunk and branches which feed an electric...


In [4]:
full_dict = pd.concat([dict1, dict2])

In [5]:
full_dict[full_dict.duplicated(['term'])]

Unnamed: 0,term,definition
2,alternating current,a flow of electrons which reverses its directi...
4,ammeter,an instrument for measuring the flow of electr...
5,ampere,a unit of measure for the flow of current in a...
6,ampere-hour,a unit of measure for battery capacity. it is ...
11,armature,the movable part of a generator or motor. it i...
19,capacitor,a device which stores electrical energy. commo...
21,current,movement of electricity along a conductor. cur...
23,cycle,the change in an alternating electrical sine w...
28,diode,an electrical device that will allow current t...
29,direct current,a steady flow of electrons moving steadily and...


In [6]:
full_dict = full_dict.drop_duplicates(subset='term', keep="first")
full_dict

Unnamed: 0,term,definition
0,alternating current,an electric current that reverses its directio...
1,ammeter,an instrument for measuring the flow of electr...
2,ampacity,the maximum amount of electric current a condu...
3,ampere-hour,a unit of measure for battery capacity. it is ...
4,ampere,a unit of measure for the intensity of an elec...
...,...,...
163,voltage regulator,a device that controls the strength of a magne...
167,wave,a signal that is produced by varying a continu...
169,winding,the coiling of a wire about itself or about so...
170,wiring harness,thf3 trunk and branches which feed an electric...


# System text answer extraction

In [7]:
system_dict = []
term = ''
wiki_def = ''
between_quotes = r'\"(.+?)\"'

mult_substr_count = 0
no_wiki_entry_count = 0
skipped_unknown_count = 0
skipped_error_count = 0

with open('./Electricity/electricity.summary.txt') as file:
    complete_file = file.read()
sections = complete_file.split('*********************************************')

for section in sections:
    try:
        subsections = section.split('*************************************')
        term_section = subsections[1]
        definition_section = subsections[2]

        term = re.search(between_quotes, term_section).group(1)

        def_split = definition_section.split('\n\n')

        if def_split[1].startswith('Wikipedia First Paragraph for substrings of'):
            mult_substr_count += 1
        elif def_split[1].startswith('Wikipedia First Paragraph for'):
            definition = def_split[2]
        elif def_split[1].startswith('No Wikipedia Entry Found'):
            no_wiki_entry_count += 1
        else:
            skipped_unknown_count += 1

        system_dict.append((term,definition))
    except:
        print(section)
        skipped_error_count += 1
        
missed_total = mult_substr_count + no_wiki_entry_count + skipped_unknown_count
print(missed_total)

df = pd.DataFrame(system_dict, columns =['term', 'definition']) 
    
df
# between_quotes = r'\"(.+?)\"'
# m = re.search(between_quotes, test)
# print(m.group(1))




264


Unnamed: 0,term,definition
0,4th millennium bc,The 4th millennium BC spanned the years 4000 t...
1,a company founded by edison in germany,Allgemeine Elektricitäts-Gesellschaft AG (AEG)...
2,ac circuit,Allgemeine Elektricitäts-Gesellschaft AG (AEG)...
3,ac generator,"In electricity generation, a generator is a de..."
4,ac motor,An AC motor is an electric motor driven by an ...
...,...,...
945,working electrode,The working electrode is the electrode in an e...
946,workpiece,The working electrode is the electrode in an e...
947,world view,A world view or worldview is the fundamental c...
948,wye,A world view or worldview is the fundamental c...


In [8]:
reference = {}
for x in full_dict.to_dict('split')['data']:
    reference[x[0]] = x[1]

In [9]:
system = {}
left_out = 0
included = 0
for x in df.to_dict('split')['data']:
    if x[0] in reference: #check only the ones we have a definition for
        system[x[0]] = x[1]
        included +=1
    else:
        left_out+=1
        
print(len(reference))
print(len(system))
final_list = []
for x in system.keys():
#     obj = {
#         "reference" : {
#             'term': x,
#             'definition': reference[x]
#             },
#         "system": {
#             'term': x,
#             'definition': system[x]
#             }
#     }
    obj = {
        "reference" :  reference[x],
        "system": system[x]
    }

    final_list.append(obj)

with open('electricity' + '.txt', 'w') as outfile:
    json.dump(final_list, outfile)

for i, e in enumerate(final_list):
    if "(" in e['reference'] and "\n\n" in e['reference']:
        print(i,": ",e)


209
22


In [10]:
temp = {}
for x in df.to_dict('split')['data']:
    temp[x[0]] = x[1]

In [11]:
not_in_sys = 0
inc = 0
for x in full_dict.to_dict('split')['data']:
    if x[0] not in temp:
        not_in_sys += 1
        print(x[0])
    else:
        inc +=1
print(not_in_sys)
print(inc)

ammeter
ampacity
ampere
apparent power
armature
capacitor
circuit
conductor
corona
current
cycle
demand
dielectric constant
diode
electrolyte
electron
electron theory
farad
ferroresonance
frequency
fuse
generator
ground
ground fault circuit interrupters
henry
hertz
inductance
inductor
insulator
kilowatt-hour meter
kilowatt
load
load rejection
mutual induction
ohm
ohm's law
open circuit
parallel circuit
piezoelectricity
polarity
power
protective relay
rectifier
relay
reluctance
resistance
resistor
rotor
self induction
semiconductor
series-parallel circuit
service
short circuit
solid state circuit
transistor
true power
vars
variable resistor
volt-ampere
volt
voltage
voltmeter
watt-hour
watt
wattmeter
waveform
actuator solenoid
ambient temperature
amplifier
amplitude
analog ic
analog gauge
artificial magnets
atom
auxiliary speed sensor
bendix drive
break
brush
calibration
charge
current flow
cycling
diagnostic code
differentiator circuit
digital ic
discharge
distributor
distributor lead c

In [18]:
full_dict.to_dict('split')['data'][1][0]

'ammeter'

In [20]:
temp

{'4th millennium bc': 'The 4th millennium BC spanned the years 4000 through 3001 BC. Some of the major changes in human culture during this time included the beginning of the Bronze Age and the invention of writing, which played a major role in starting recorded history.',
 'a company founded by edison in germany': 'Allgemeine Elektricitäts-Gesellschaft AG (AEG) (German: "General electricity company") was a German producer of electrical equipment founded as the "Deutsche Edison-Gesellschaft für angewandte Elektricität" in 1883 in Berlin by Emil Rathenau. After World War II its headquarters moved to Frankfurt am Main.',
 'ac circuit': 'Allgemeine Elektricitäts-Gesellschaft AG (AEG) (German: "General electricity company") was a German producer of electrical equipment founded as the "Deutsche Edison-Gesellschaft für angewandte Elektricität" in 1883 in Berlin by Emil Rathenau. After World War II its headquarters moved to Frankfurt am Main.',
 'ac generator': 'In electricity generation, a g

# Calculus



In [21]:
import urllib.request
from bs4 import BeautifulSoup
import sys
import json

# topic = sys.argv[1].lower()
url = "https://en.wikipedia.org/wiki/Glossary_of_" + 'Calculus'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")


terms = soup.findAll('dt', class_='glossary')
definitions = soup.findAll('dd', class_='glossary')

assert(len(terms) == len(definitions))



In [71]:
wiki = {}

for i in range(len(terms)):
    wiki[terms[i].text.lower()] =  definitions[i].text.lower().replace('\n', '')
#     wiki_terms.append(terms[i].text.lower())
#     wiki_def.append(definitions[i].text.lower().replace('\n', ''))


187

In [63]:
system_dict = []
term = ''
wiki_def = ''
between_quotes = r'\"(.+?)\"'

mult_substr_count = 0
no_wiki_entry_count = 0
skipped_unknown_count = 0
skipped_error_count = 0

with open('./calculus.summary.txt') as file:
    complete_file = file.read()
sections = complete_file.split('*********************************************')

for section in sections:
    try:
        subsections = section.split('*************************************')
        term_section = subsections[1]
        definition_section = subsections[2]

        term = re.search(between_quotes, term_section).group(1)

        def_split = definition_section.split('\n\n')

        if def_split[1].startswith('Wikipedia First Paragraph for substrings of'):
            mult_substr_count += 1
        elif def_split[1].startswith('Wikipedia First Paragraph for'):
            definition = def_split[2]
        elif def_split[1].startswith('No Wikipedia Entry Found'):
            no_wiki_entry_count += 1
        else:
            skipped_unknown_count += 1

        system_dict.append((term,definition))
    except:
        print(section)
        skipped_error_count += 1
        
missed_total = mult_substr_count + no_wiki_entry_count + skipped_unknown_count
print("missed: ",missed_total)

df = pd.DataFrame(system_dict, columns =['term', 'definition']) 
    
df
# between_quotes = r'\"(.+?)\"'
# m = re.search(between_quotes, test)
# print(m.group(1))




missed:  174


Unnamed: 0,term,definition
0,ab,The zeta function of a mathematical operator f...
1,abel summation,"In mathematics, a divergent series is an infin..."
2,abelian theorem,"In mathematics, Abelian and Tauberian theorems..."
3,abscissa,"In mathematics, the abscissa (; plural ""abscis..."
4,accused leibniz,"In mathematics, the abscissa (; plural ""abscis..."
...,...,...
701,weierstrass substitution,"In integral calculus, the Weierstrass substitu..."
702,weierstrass theorem,Several theorems are named after Karl Weierstr...
703,wiener process,"In mathematics, the Wiener process is a real v..."
704,yukti-dipika,"In mathematics, the Wiener process is a real v..."


In [65]:
skipped = 0
for x in df['term'].values:
    if x not in wiki:
        print(x)
        skipped += 1
print(skipped)

ab
abel summation
abelian theorem
abscissa
accused leibniz
acta eruditorum
adequality
adjoint
alembert
alembert operator
algebraic equation
algebraic function
analytic continuation
analytic function
analytic solution
angle subtended
antidifferentiation
approximation method
archimede
arithmetic function
arithmetic mean
arithmetic operation
arithmetic progression
arithmetic-geometric mean
arithmetica infinitorum
astronomy kerala school
asymptotic behavior
asymptotic behaviour
asymptotic expansion
asymptotic series
augustin louis cauchy
augustin-louis cauchy
automorphic form
autonomous system
banach space
basel problem
basis vector
bernoulli
bessel function
binomial series
binomial theorem
bolzano
borel measure
borel summation
boundary condition
boundary layer
boundary term
boundary value problem
brownian motion
calculus fundamental theorem
calculus of moving surfaces
calculus state
calculus state fundamental theorem
canonical choice
canonical commutation
capacitor
cardinality
cartesian
c

In [73]:
wiki

{"abel's test": 'a method of testing for the convergence of an infinite series.',
 'absolute convergence': 'an infinite series of numbers is said to converge absolutely (or to be absolutely convergent) if the sum of the absolute values of the summands is finite. more precisely, a real or complex series ∑n=0∞an{\\displaystyle \\textstyle \\sum _{n=0}^{\\infty }a_{n}} is said to converge absolutely if ∑n=0∞|an|=l{\\displaystyle \\textstyle \\sum _{n=0}^{\\infty }\\left|a_{n}\\right|=l} for some real number l{\\displaystyle \\textstyle l}. similarly, an improper integral of a function, ∫0∞f(x)dx{\\displaystyle \\textstyle \\int _{0}^{\\infty }f(x)\\,dx}, is said to converge absolutely if the integral of the absolute value of the integrand is finite—that is, if ∫0∞|f(x)|dx=l.{\\displaystyle \\textstyle \\int _{0}^{\\infty }\\left|f(x)\\right|dx=l.}',
 'absolute maximum': '',
 'absolute minimum': '',
 'absolute value': 'the absolute value or modulus |x| of a real number\xa0x is the non-nega