In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [14]:
# defining functions
def collect_tag_links():
    '''
    collects all complete links to different intership category pages such as:
    internships in Delhi, Finance internships, Tech internships, internships in chandigarh etc.
    
    OUPUT:
    tag_links - returns all links to different present on www.letsintern.com 
    '''
    tag_links = []
    
    url = requests.get('https://www.letsintern.com/')
    data = url.text
    soup = BeautifulSoup(data)
    
    tags = soup.find_all('li',attrs = {'class':'col-sm-4'})
    for i in tags:
        hrefs = i.find_all('a')
        hrefs = ['http://letsintern.com' + j['href'] for j in hrefs]
        tag_links.extend(hrefs)
        
    return tag_links

def find_links_main(links_list):
    '''
    finds links to specific internship pages from pages where different internship profiles are listed. 
    Eg : www.letsintern.com/interships/IT-internships
    
    INPUT:
    links_list - a list of complete links that refer to different categorial pages
    
    OUPUT:
    collected_links - a list of collected links from all the pages in links_list 
    '''
    collected_links = []
    n = 1
    
    for link in links_list:
        url = requests.get(link)
        data = url.text
        soup = BeautifulSoup(data)
        
        links = soup.find_all('div', attrs = {'class':'job-title'})
        
        collected_links.extend(['http://letsintern.com' + i.a['href'] for i in links if 'letsintern' not in i.a['href']])
        
        print(n/len(links_list))
        n+= 1
    collected_links = list(set(collected_links))
    return collected_links

def find_links_internship(links_list):
    '''
    finds the links present on an internship profile page. These links point to a company 
    page where more links are posted. 
    Also, the links collected will be similar to the internship on that page as these themselves are the 
    recommendations by letsintern.
    
    INPUT:
    links_list - a list of complete links that refer to specific internship pages
    
    OUTPUT:
    collected_links - a list of collected links from all the pages in links_list after removing the 
                      links in the input links_list(ensuring we aren't returning duplicates).
                      
    '''
    collected_links = []
    n = 1
    
    for link in links_list:
        url = requests.get(link)
        data = url.text
        soup = BeautifulSoup(data)
        
        links = soup.find_all('div',attrs={'class':"col-sm-9 col-xs-9"})
        
        collected_links.extend(['http://letsintern.com' + i.a['href'] for i in links])
        
        print(n/len(links_list))
        n+= 1
    collected_links = list(set(collected_links))
    return collected_links

def find_links_company(links_list):
    '''
    finds the links to different internship profiles present on an internship company page. 
    
    INPUT:
    links_list - a list of complete links that refer to specific company pages
    
    OUTPUT:
    collected_links - a list of collected links from all the pages in links_list after removing the 
                      links in links_list.
    '''
    collected_links = []
    n = 1
                           
    for link in links_list:
        url = requests.get(link)
        data = url.text
        soup = BeautifulSoup(data)
                           
        links = soup.find_all('div',attrs = {'class':'job-title'})
                           
        for i in links:
            try:
                collected_links.append('http://letsintern.com'+ i.a['href'])
            except:
                print(i.a)
                continue
        print(n/len(links_list))
        n+= 1
    return list(set(collected_links))

def extract_data(links_list):
    '''
    extracts all the relevant data needed from each of the links and returns and saves a 
    dataframe containing all that information.
    
    INPUT:
    links_list - a list of complete links that refer to specific internship pages
    
    OUTPUT:
    df - a dataframe with the rows as the links and the columns as the information extracted
    
    '''
    job_title = []
    company_name = []
    job_loc = []
    details = []
    category = []
    compensation =[]
    start = []
    end = []
    skills = []
    hrefs = []
    n = 1
    
    for link in links_list:
        url = requests.get(link)
        data = url.text
        soup = BeautifulSoup(data) 

        try :
            # many job titles were not given as the pages didn't exist themselves
            job_title.append(soup.find_all('div',attrs={'class':'job-title'})[0].text)
        except:
            print(link)
            links_list.remove(link)
            continue # continue breaks the current iteration of the loop and jumps to the next one
        try:                 
            company_name.append(soup.find_all('div', attrs ={'class':'company-name'})[0].text)
        except:
            print(soup.find_all('div', attrs ={'class':'company-name'}))
            company_name.append('no company found')
        try:
            job_loc.append(soup.find_all('div', attrs ={'class':'job-locations'})[0].text)
        except:
            print(soup.find_all('div', attrs ={'class':'job-locations'}))
            job_loc.append('no job location found')
        try:
            details.append(soup.find_all('div', attrs ={'class':'details-section fixht'})[0].text)
        except:
            print(soup.find_all('div', attrs ={'class':'details-section fixht'}))
            details.append('no details found')
        try:  
            category.append(soup.find_all('a', attrs= {'title':'Internship Category'})[0].text)
        except:
            print(soup.find_all('a', attrs= {'title':'Internship Category'}))
            category.append('no category found')
        try:
            compensation.append(soup.find_all('a', attrs= {'title':'Compensation Type'})[0].text)
        except: 
            print(soup.find_all('a', attrs= {'title':'Compensation Type'}))
            compensation.append('no compensation found')
        try:
            start.append(soup.find_all('li', attrs = {'title':'Start Date'})[0].text)
        except:
            print(soup.find_all('li', attrs = {'title':'Start Date'}))
            start.append('no start date found')
        try:    
            end.append(soup.find_all('li', attrs = {'title':'End Date'})[0].text)
        except:
            print(soup.find_all('li', attrs = {'title':'End Date'}))
            end.append('no end date found')
        try:
            skills.append(soup.find_all('div', attrs = {'id':'skills-required'})[0].text)
        except: 
            print(soup.find_all('div', attrs = {'id':'skills-required'}))
            skills.append('no skills found')
        hrefs.append(link)
        print(n/len(links_list))
        n+=1
       

    df = pd.DataFrame({'href':hrefs, 'job_title':job_title, 'company_name':company_name, 'job_loc':job_loc
                      ,'details':details, 'category':category, 'compensation':compensation, 'start':start
                      ,'end':end, 'skills':skills})
    df.to_csv('../data_for_notebooks/information_from_links.csv')
    return df


In [15]:
tag_links = collect_tag_links()

In [16]:
saved_links = find_links_main(tag_links)


0.03571428571428571
0.07142857142857142
0.10714285714285714
0.14285714285714285
0.17857142857142858
0.21428571428571427
0.25
0.2857142857142857
0.32142857142857145
0.35714285714285715
0.39285714285714285
0.42857142857142855
0.4642857142857143
0.5
0.5357142857142857
0.5714285714285714
0.6071428571428571
0.6428571428571429
0.6785714285714286
0.7142857142857143
0.75
0.7857142857142857
0.8214285714285714
0.8571428571428571
0.8928571428571429
0.9285714285714286
0.9642857142857143
1.0


In [None]:
saved_links_1 = find_links_internship(saved_links)
saved_links_2 = find_links_company(saved_links_1)
final_links = list(set(saved_links_2) | set(saved_links))
len(final_links)
df = extract_data(final_links)


0.006493506493506494
0.012987012987012988
0.01948051948051948
0.025974025974025976
0.032467532467532464
0.03896103896103896
0.045454545454545456
0.05194805194805195
0.05844155844155844
0.06493506493506493
0.07142857142857142
0.07792207792207792
0.08441558441558442
0.09090909090909091
0.09740259740259741
0.1038961038961039
0.11038961038961038
0.11688311688311688
0.12337662337662338
0.12987012987012986
0.13636363636363635
0.14285714285714285
0.14935064935064934
0.15584415584415584
0.16233766233766234
0.16883116883116883
0.17532467532467533
0.18181818181818182
0.18831168831168832
0.19480519480519481
0.2012987012987013
0.2077922077922078
0.21428571428571427
0.22077922077922077
0.22727272727272727
0.23376623376623376
0.24025974025974026
0.24675324675324675
0.2532467532467532
0.2597402597402597
0.2662337662337662
0.2727272727272727
0.2792207792207792
0.2857142857142857
0.2922077922077922
0.2987012987012987
0.3051948051948052
0.3116883116883117
0.3181818181818182
0.3246753246753247
0.33116883

0.8892508143322475
0.8925081433224755
0.8957654723127035
0.8990228013029316
0.9022801302931596
0.9055374592833876
0.9087947882736156
0.9120521172638436
0.9153094462540716
0.9185667752442996
0.9218241042345277
0.9250814332247557
0.9283387622149837
0.9315960912052117
0.9348534201954397
0.9381107491856677
0.9413680781758957
0.9446254071661238
0.9478827361563518
0.9511400651465798
0.9543973941368078
0.9576547231270358
0.9609120521172638
0.9641693811074918
0.9674267100977199
0.9706840390879479
0.9739413680781759
0.9771986970684039
0.9804560260586319
0.9837133550488599
0.9869706840390879
0.990228013029316
0.993485342019544
0.996742671009772
1.0
0.0017889087656529517
0.0035778175313059034
0.005366726296958855
0.007155635062611807
0.008944543828264758
0.01073345259391771
0.012522361359570662
0.014311270125223614
0.016100178890876567
0.017889087656529516
0.01967799642218247
0.02146690518783542
0.023255813953488372
0.025044722719141325
http://letsintern.com/internship/Content-Development-Profess

0.5575868372943327
0.5594149908592322
0.5612431444241316
0.5630712979890311
0.5648994515539305
0.56672760511883
0.5685557586837294
0.5703839122486288
0.5722120658135283
0.5740402193784278
0.5758683729433273
0.5776965265082267
0.5795246800731262
0.5813528336380256
0.583180987202925
0.5850091407678245
0.5868372943327239
0.5886654478976234
0.5904936014625228
0.5923217550274223
0.5941499085923218
0.5959780621572212
0.5978062157221207
0.5996343692870201
0.6014625228519196
0.603290676416819
0.6051188299817185
0.6069469835466179
0.6087751371115173
http://letsintern.com/internship/Content-Development-Professional-internships/Aashna-Malani/Content-Writer/66329
0.6117216117216118
0.6135531135531136
0.6153846153846154
0.6172161172161172
0.6190476190476191
0.6208791208791209
0.6227106227106227
0.6245421245421245
0.6263736263736264
0.6282051282051282
0.63003663003663
0.6318681318681318
0.6336996336996337
0.6355311355311355
0.6373626373626373
0.6391941391941391
0.6410256410256411
0.6428571428571429
