In [1]:
import pandas as pd
import pyxlsb

In [2]:
df = pd.read_csv('paper_cites.csv')

In [13]:
# Keep only the rows that carry an affiliation id.
df = df.dropna(subset=['afid'])

In [15]:
# Overwrite the input file with the cleaned rows. The positional index is
# left out: this file is read back later, and saving the index would add a
# new 'Unnamed: 0' column on every read/write round trip.
df.to_csv('paper_cites.csv', index=False)

# Join paper_cites and affils_w_startup

In [7]:
papers = pd.read_csv('paper_cites.csv')
affils = pd.read_csv('affils_w_startup.csv')

In [8]:
# Keep only the identifier and citation columns needed for the join.
columns = [
    'eid',
    'afid',
    'source_id',
    'cites',
]
papers = papers.loc[:, columns]

In [9]:
# Keep only the affiliation attributes used downstream. An explicit copy is
# taken (as for the other column subsets in this notebook) so that the later
# in-place rename works on an independent frame, not on a slice of the
# original.
columns = ['af_id', 'affil_name', 'affil_type', 'start_up']
affils = affils[columns].copy()

In [13]:
# Align the key name with the papers table ('afid') so the merge can use a
# single `on` column.
affils = affils.rename(columns={'af_id': 'afid'})

In [15]:
# Left join: every paper row is kept; affiliation columns are NaN where the
# afid has no match in affils.
merged = papers.merge(affils, on='afid', how='left')

In [21]:
# Discard the rows that have no affiliation type.
merged = merged.dropna(subset=['affil_type'])

In [24]:
# Store the affiliation id as an integer.
merged = merged.astype({'afid': int})

In [27]:
# Persist the merged paper/affiliation table. The row index is written too,
# so it comes back as an extra 'Unnamed: 0' column when the file is reloaded
# with read_csv (later cells drop that column explicitly).
merged.to_csv('paper_affils.csv')

# Check merging with CiteScore

In [28]:
cite_score = pd.read_excel('CiteScore.xlsb', sheet_name='CiteScore 2011', engine='pyxlsb')

In [12]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [29]:
# Journal identifier, quality metrics and title kept from the CiteScore sheet.
columns = [
    'Scopus Source ID',
    'CiteScore',
    'SJR',
    'Percentile',
    'RANK',
    'Rank Out Of',
    'Quartile',
    'Title',
]
cite_score = cite_score.loc[:, columns].copy()

In [30]:
cite_score.head(1)

Unnamed: 0,Scopus Source ID,CiteScore,SJR,Percentile,RANK,Rank Out Of,Quartile,Title
0,12253,0.3,0.135,27,204,281,3,China Business Review


In [18]:
paper_affils = pd.read_csv('paper_affils.csv')

In [19]:
paper_affils.shape[0]

1044261

In [31]:
# Use the same key name as paper_affils ('source_id').
cite_score = cite_score.rename(columns={'Scopus Source ID': 'source_id'})

In [34]:
# A left merge on source_id creates extra rows, because the cite_score df
# holds multiple records for each source_id.
#papers_affils_journal_qual = pd.merge(paper_affils, cite_score, how='left', on='source_id')

In [40]:
cite_score['source_id'].to_csv('citescore_2011.csv')

In [44]:
paper_affils['source_id'].to_csv('source_ids_in_dataset.csv')

# Consolidate journal quality data

We currently have the following:

* `paper_affils.csv` with `source_id` variable.
* `CiteScore.xlsb` with the CiteScore 2011 tab - this has one record per research area for each source id, so it needs to be reduced to a single research area per source id.
* `research_area.csv` with `source_id` - this maps one source id to one research area.

To complete the analysis, `source_id` from `paper_affils.csv` needs to be mapped onto `research_area.csv` to find one research area. Then, use the source id and research area to look up the corresponding journal quality metrics in `CiteScore.xlsb`.

__Steps:__

1. Map `source_id` from `paper_affils.csv` onto `research_area.csv`
2. Search `CiteScore.xlsb` for the corresponding source id and research area

# Consolidate CH start up location

We currently have the following:

* `affils_w_startup.csv` - all affiliations that could be classified (48060) - contains `affil_city` from the original Scopus data
* `scopus_affils_startup.csv` - affiliations that were (1) classified as a company, (2) not classified using string matching, and (3) have an established or incorporated date in or after 1990 - contains `CH_city` and `CH_post_code` from the Companies House search
    * NOTE: This csv file includes all matches - check `best_score > 85` for whether they were sent to FAME
    
It would be ideal if I could use `affil_city` as the first option for geographic data and then use `CH_city` if this does not work.

# Need fully consolidated data set!

__Creating one data set:__

1. Baseline is `paper_affils.csv` - DONE
2. Add journal quality from the `source_id` sheet of `journal_qual_consolidated.xlsx` (take `percentile_asjc`) - this can be concatenated column-wise without a join because the sheet was pulled directly from `paper_affils.csv` - DONE
3. Add start-up location from the `final_affils_startups` sheet of `start-up-location-consolidated-1.xlsx`
4. Add `est_inc_date` from the `final_affils_startups` sheet of `start-up-location-consolidated-1.xlsx`

In [73]:
paper_affils = pd.read_csv('paper_affils.csv')

In [75]:
journal_qual = pd.read_excel('journal_qual_consolidated.xlsx', sheet_name='source_id')

In [79]:
# Remove the two '*_auths' columns before combining with the paper rows.
journal_qual = journal_qual.drop(columns=['asjc_auths', 'percentile_asjc_auths'])

In [83]:
# Put the journal-quality columns side by side with the paper rows. With
# axis='columns', concat lines the two frames up on their index labels; no
# key-based join on source_id is performed.
papers_journals = pd.concat(objs=[paper_affils, journal_qual], axis='columns')

In [93]:
papers_journals.head(1)

Unnamed: 0.1,Unnamed: 0,eid,afid,source_id,cites,affil_name,affil_type,start_up,source_id.1,asjc,CiteScore,SNIP,SCImago JR,percentile_asjc
0,0,2-s2.0-0034739787,60030480,14102,27.0,University of Bath,education,False,14102,1303,5.5,0.953,1.143,76.0


In [87]:
start_ups = pd.read_excel('start-up-location-consolidated-1.xlsx', sheet_name='final_affils_startups')

In [91]:
# Keep the affiliation key plus the location and date columns to be added.
start_ups = start_ups.loc[:, ['af_id', 'post_area', 'est_inc_date']].copy()

In [94]:
# Match the key name used in papers_journals ('afid').
start_ups = start_ups.rename(columns={'af_id': 'afid'})

In [95]:
papers_journals.shape

(1044261, 14)

In [98]:
# Attach post_area and est_inc_date to every paper row. The left join keeps
# all papers; rows whose afid is absent from start_ups get NaN in the new
# columns.
papers_journals_startups = pd.merge(papers_journals, start_ups, how='left', on='afid')

# Guard against duplicate afid values in start_ups silently multiplying
# paper rows during the left join.
assert len(papers_journals_startups) == len(papers_journals), (
    'left merge on afid changed the row count; start_ups has duplicate afid values'
)

In [100]:
# 'Unnamed: 0' is the row index that was saved into paper_affils.csv.
papers_journals_startups = papers_journals_startups.drop(columns=['Unnamed: 0'])

In [104]:
papers_journals_startups.to_csv('final.csv')

# Forgot research area

In [2]:
final = pd.read_csv('final.csv')

In [4]:
# Remove the second copy of the source id that came in with the
# journal-quality columns.
final = final.drop(columns=['source_id.1'])

In [6]:
final.head(1)

Unnamed: 0.1,Unnamed: 0,eid,afid,source_id,cites,affil_name,affil_type,start_up,asjc,CiteScore,SNIP,SCImago JR,percentile_asjc,post_area,est_inc_date
0,0,2-s2.0-0034739787,60030480,14102,27.0,University of Bath,education,False,1303,5.5,0.953,1.143,76.0,,


In [7]:
research_area = pd.read_csv('research_area.csv')

In [12]:
# Drop the ASJC and author-derived columns; the remaining columns are
# merged onto the papers by source_id.
unused_columns = ['asjc', 'asjc_source', 'area_auths', 'asjc_auths']
research_area = research_area.drop(columns=unused_columns)

In [13]:
research_area.head(1)

Unnamed: 0,source_id,area
0,10000153301,IMMU


In [14]:
final.shape

(1044261, 15)

In [15]:
# Look up the research area of each paper's journal; papers without a match
# keep NaN in 'area'.
final_2 = final.merge(research_area, on='source_id', how='left')

In [16]:
final_2.shape

(1044261, 16)

In [20]:
# How many rows did / did not receive a research area?
final_2['area'].isna().value_counts()

False    1042614
True        1647
Name: area, dtype: int64

In [22]:
# Discard the index column that was picked up when final.csv was reloaded.
final_2 = final_2.drop(columns=['Unnamed: 0'])

In [24]:
final_2.to_csv('final.csv')

# Forgot publication date

In [2]:
papers = pd.read_csv('papers.csv')

In [14]:
dates = papers[['eid', 'afid', 'coverDate']].copy()

In [8]:
final = pd.read_csv('final.csv')

In [15]:
final.shape

(1044261, 16)

In [17]:
# A paper/affiliation pair is identified by (eid, afid); both frames use the
# same key names, so a single `on` list is enough.
on = ['eid', 'afid']
final_2 = final.merge(dates, how='left', on=on)

In [20]:
final_2.drop(columns=['Unnamed: 0']).to_csv('final.csv')

# Format final

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk. NOTE(review): this is meant to silence the
# warning emitted while loading this file, presumably a mixed-type
# DtypeWarning - confirm.
final = pd.read_csv('final_actual_start_up.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
final.head(1)

Unnamed: 0,Affil Name,Affil Type,Area,Eid,inc_year,Post Area,Source Id,start-up-candidate,Afid,Asjc,Cite Score,Cites,F1,Percentile Asjc,pub_year,SCImago JR,Snip,start-up
0,University of Southampton,education,ECON,2-s2.0-0034123014,,,23352,False,60025225,2002,5.0,21,65536,93.0,2000,3.615,2.512,0


In [8]:
# Columns to keep, in the order they should appear in the tidy table.
o = [
    'Eid', 'pub_year',
    'Afid', 'Affil Name', 'Post Area', 'inc_year', 'Affil Type', 'start-up',
    'Source Id', 'Area',
    'Cites', 'Cite Score', 'Percentile Asjc', 'SCImago JR', 'Snip',
]
final_1 = final.loc[:, o].copy()

In [10]:
# Map the spreadsheet-style headers to short snake_case names. Columns not
# listed here keep their current name.
rename = {
    'Eid': 'eid',
    'Afid': 'afid',
    'Affil Name': 'name',
    'Post Area': 'area_code',
    'Affil Type': 'type',
    'Source Id': 'journal_id',
    'Area': 'res_area',
    'Cites': 'cites_10_yrs',
    'Cite Score': 'cs_2011',
    'Percentile Asjc': 'res_area_perc',
    'SCImago JR': 'sci_jr',
    'Snip': 'snip',
}
final_1 = final_1.rename(columns=rename)

In [15]:
# Save the tidy table. The row index is written as well, so a later
# read_csv of this file yields an additional 'Unnamed: 0' column.
final_1.to_csv('final_tidy.csv')

In [16]:
final_1.head()

Unnamed: 0,eid,pub_year,afid,name,area_code,inc_year,type,start-up,journal_id,res_area,cites_10_yrs,cs_2011,res_area_perc,sci_jr,snip
0,2-s2.0-0034123014,2000,60025225,University of Southampton,,,education,0,23352,ECON,21,5.0,93.0,3.615,2.512
1,2-s2.0-0034120005,2000,60019665,Hannah Research Institute,,,resi,0,33928,AGRI,14,2.5,75.0,0.762,1.01
2,2-s2.0-0034110440,2000,60116446,University of Liverpool Management School,,,education,0,15807,SOCI,10,1.6,73.0,0.977,1.167
3,2-s2.0-0034105766,2000,60028485,University Hospitals of Leicester NHS Trust,,,healthcare,0,20994,MEDI,2,,,,
4,2-s2.0-0034101663,2000,60020849,Whipps Cross Hospital,,,healthcare,0,16616,MEDI,2,3.5,64.0,0.927,1.374


# Who are start-ups publishing computer science research?

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk. NOTE(review): intended to remove the
# warning emitted by this cell, presumably a mixed-type DtypeWarning; it is
# not visible which of the two files triggers it, so both reads set the
# option - confirm.
df = pd.read_csv('final_tidy.csv', low_memory=False)
affils = pd.read_csv('affils_w_startup.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# Start-up rows in the COMP research area, counted per affiliation
# (non-null values per column) and ordered by number of papers (eid count),
# largest first.
comp = (
    df.loc[df['start-up'].eq(1) & df['res_area'].eq('COMP')]
    .groupby('afid')
    .count()
    .sort_values('eid', ascending=False)
)

In [27]:
comp

Unnamed: 0_level_0,Unnamed: 0,eid,pub_year,name,area_code,inc_year,type,start-up,journal_id,res_area,cites_10_yrs,cs_2011,res_area_perc,sci_jr,snip
afid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
60098463,168,168,168,168,168,168,168,168,168,168,168,153,153,153,153
60100029,11,11,11,11,11,11,11,11,11,11,11,9,8,9,9
100322617,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10
100325034,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
100491623,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101638102,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0
101640424,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
101642236,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
101651246,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [29]:
df[df['afid'] == 60100029]

Unnamed: 0.1,Unnamed: 0,eid,pub_year,afid,name,area_code,inc_year,type,start-up,journal_id,res_area,cites_10_yrs,cs_2011,res_area_perc,sci_jr,snip
447722,447722,2-s2.0-0037357217,2003,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,25023,COMP,23,4.9,,0.784,1.865
447724,447724,2-s2.0-35248821334,2003,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,25674,COMP,4,1.3,55.0,0.338,0.778
447727,447727,2-s2.0-0346266314,2004,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,24998,COMP,12,2.5,70.0,0.59,1.406
447728,447728,2-s2.0-2942676751,2004,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,24998,COMP,4,2.5,70.0,0.59,1.406
447730,447730,2-s2.0-33749441021,2004,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,24972,COMP,11,9.5,99.0,1.414,3.603
507338,507338,2-s2.0-13844296429,2005,60100029,Numerical Geometry Ltd.,CB,1994.0,company,0,28041,MATH,2,2.8,80.0,1.006,1.39
534050,534050,2-s2.0-0036976663,2002,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,25572,COMP,4,,,,
534051,534051,2-s2.0-0036627460,2002,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,25572,COMP,10,,,,
534052,534052,2-s2.0-0036609139,2002,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,26146,COMP,16,2.5,53.0,0.31,1.103
534053,534053,2-s2.0-0036685219,2002,60100029,Numerical Geometry Ltd.,CB,1994.0,company,1,26146,COMP,17,2.5,53.0,0.31,1.103
