In [None]:
# Mount Google Drive so the data files referenced under /content/drive
# can be opened from this Colab session.
from google.colab import drive

drive.mount('/content/drive')

import itertools
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import numpy as np
import re
import math
import pandas as pd
import networkx as nx
# Tally of subject labels; filled in later while the index file is parsed.
Subjects = dict()

# Input files on the mounted drive: the subject/index text file and the
# CSV holding the summaries.
file_path = "/content/drive/My Drive/Colab Notebooks/14-Indices (2).txt"
file_path2 = '/content/drive/My Drive/Colab Notebooks/FinalizedSummaries.csv'

# Only the first 300 rows of the CSV are analysed.
df = pd.read_csv(file_path2).iloc[:300]

# One container per graph metric. Each maps a summary column name to a dict
# that is later filled with {summary index: metric value}.
graph_measures = [
    {"Lay Summary": {}, "Technical Summary": {}} for _ in range(7)
]

# Individual names for the containers, in the same order as the labels below.
(
    NetworkNodes,
    NetworkEdges,
    NetworkDegree,
    NetworkDensity,
    NetworkAvgClustering,
    NetworkLargestComponent,
    NetworkDegreeWeighted,
) = graph_measures

# Display labels, index-aligned with graph_measures.
graph_measure_names = [
    "NetworkNodes",
    "NetworkEdges",
    "Network Degree",
    "NetworkDensity",
    "NetworkAvgClustering",
    "NetworkLargestComponent",
    "NetworkDegreeWeighted",
]

def build_pmi_graph(pmi_scores, all_nodes):
    """Build an undirected graph whose edges carry PMI scores.

    Args:
        pmi_scores: Mapping of ``(word1, word2)`` pairs to a numeric score.
            Each pair becomes an edge with the score stored as ``weight``.
        all_nodes: Iterable of every word to add as a node, whether or not
            it appears in any pair.

    Returns:
        A ``networkx.Graph`` holding all nodes and the weighted edges.
    """
    graph = nx.Graph()
    graph.add_nodes_from(all_nodes)
    graph.add_weighted_edges_from(
        (first, second, score)
        for (first, second), score in pmi_scores.items()
    )
    return graph

def analyze_graph(G, words):
    """Summarise the structure of a word co-occurrence graph.

    Density, clustering and weighted degree are computed on the subgraph of
    nodes that have at least one edge. Average degree and the
    largest-component share are taken over every node of ``G``.

    Any ratio whose denominator would be zero (no words, no nodes, or no
    connected nodes) is reported as ``0.0`` instead of raising.

    Args:
        G: ``networkx.Graph`` whose edges carry a ``weight`` attribute.
        words: Number of word tokens in the text the graph was built from;
            used to normalise the connected-node count.

    Returns:
        dict with the keys ``'nodes per words'``, ``'edges'``,
        ``'avg_degree'``, ``'density'``, ``'average clustering'``,
        ``'largest_component_size'`` and ``'avg_degree_weighted'``.
    """
    degrees = dict(G.degree())
    connected_nodes = [node for node, degree in degrees.items() if degree > 0]
    # Build the connected-only view once; three metrics below share it.
    connected = G.subgraph(connected_nodes)
    num_nodes = G.number_of_nodes()

    if connected_nodes:
        avg_clustering = nx.average_clustering(connected, weight='weight')
        weighted_degrees = dict(connected.degree(weight='weight'))
        avg_degree_weighted = sum(weighted_degrees.values()) / len(connected_nodes)
    else:
        # No edges at all: there is nothing to average over.
        avg_clustering = 0.0
        avg_degree_weighted = 0.0

    if num_nodes:
        avg_degree = sum(degrees.values()) / num_nodes
        largest_share = len(max(nx.connected_components(G), key=len)) / num_nodes
    else:
        # Empty graph: no degrees and no components exist.
        avg_degree = 0.0
        largest_share = 0.0

    return {
        'nodes per words': len(connected_nodes) / words if words else 0.0,
        'edges': int(G.number_of_edges()),
        'avg_degree': avg_degree,
        'density': nx.density(connected),
        'average clustering': avg_clustering,
        'largest_component_size': largest_share,
        'avg_degree_weighted': avg_degree_weighted,
    }

def _pmi_scores(summary):
    """Tokenise ``summary`` and score every co-occurring word pair with PMI.

    Word probabilities are token frequencies over the whole summary; the
    joint probability of a pair is the fraction of sentences in which both
    words occur as whole, lower-cased tokens.

    Args:
        summary: The text of one summary.

    Returns:
        ``(wordsList, scores)`` where ``wordsList`` is the list of
        lower-cased alphabetic tokens and ``scores`` maps ``(w1, w2)`` to
        ``log2(p(w1, w2) / (p(w1) * p(w2)))``. Pairs that never share a
        sentence, or whose PMI is exactly 0, are omitted.
    """
    wordsList = [word.lower() for word in re.findall(r'\b[a-zA-Z]+\b', summary)]
    sentences = re.split(r'(?<=[.!?])\s+', summary)

    # Tokenise each sentence the same way as the full text so membership is
    # tested on whole lower-cased words, not on raw substrings.
    sentenceWords = [
        {word.lower() for word in re.findall(r'\b[a-zA-Z]+\b', sentence)}
        for sentence in sentences
    ]

    wordsDict = {}
    for word in wordsList:
        wordsDict[word] = wordsDict.get(word, 0) + 1

    totalWords = len(wordsList)
    numSentences = len(sentences)
    scores = {}

    for w1, w2 in itertools.combinations(wordsDict, 2):
        mutualOccurences = sum(
            1 for tokens in sentenceWords if w1 in tokens and w2 in tokens
        )
        if mutualOccurences == 0:
            continue

        pW1 = wordsDict[w1] / totalWords
        pW2 = wordsDict[w2] / totalWords
        pW1W2 = mutualOccurences / numSentences

        pmi = math.log2(pW1W2 / (pW1 * pW2))
        # A PMI of exactly 0 is treated as "no edge".
        if pmi != 0:
            scores[(w1, w2)] = pmi

    return wordsList, scores


def findNetworkComplexity(Column):
    """Compute PMI-graph metrics for every summary in ``df[Column]``.

    For each summary a word graph is built from its PMI scores and analysed
    with ``analyze_graph``. The resulting metrics are stored in the
    module-level ``Network*`` dictionaries under ``[Column][index]``, where
    ``index`` is the 1-based position of the summary in the column.

    Args:
        Column: Name of the DataFrame column to process; it must also be a
            key of each ``Network*`` dictionary.

    Returns:
        None. Results are written to the module-level dictionaries.
    """
    for count, summary in enumerate(df[Column], start=1):
        wordsList, pmiScores = _pmi_scores(summary)

        graphMetrics = analyze_graph(
            build_pmi_graph(pmiScores, set(wordsList)), len(wordsList)
        )

        NetworkNodes[Column][count] = graphMetrics['nodes per words']
        NetworkEdges[Column][count] = graphMetrics['edges']
        NetworkDegree[Column][count] = graphMetrics['avg_degree']
        NetworkDensity[Column][count] = graphMetrics['density']
        NetworkAvgClustering[Column][count] = graphMetrics['average clustering']
        NetworkLargestComponent[Column][count] = graphMetrics['largest_component_size']
        NetworkDegreeWeighted[Column][count] = graphMetrics['avg_degree_weighted']


# Fill the metric dictionaries for both summary columns.
findNetworkComplexity("Lay Summary")
findNetworkComplexity("Technical Summary")

lineNum = 0
laySummaryCount = 0
techSummaryCount = 0
subjectLabels = {}

with open(file_path, "r") as file:
    # The index file is consumed in groups of three lines: a subject line
    # followed by two comma-separated numeric lines (counted as lay, then
    # technical).
    for lineNum, line in enumerate(file, start=1):
        position = lineNum % 3

        if position == 1:
            # Subject line: tally it and keep it for the lines that follow.
            subject = line.strip().lower()
            Subjects[subject] = Subjects.get(subject, 0) + 1
            continue

        # Numeric line: every comma-separated field must parse as a float.
        preValues = [val.strip() for val in line.strip().split(",")]
        values = [float(value) for value in preValues]

        if position == 2:
            laySummaryCount += 1
            # Label this record (1-based) with the most recent subject.
            subjectLabels[laySummaryCount] = subject.strip().lower()
        else:
            techSummaryCount += 1

# Subject labels (lower-cased) grouped under three umbrella disciplines.
physicalSciences = [
    'environmental sciences',
    'earth, atmospheric, and planetary sciences',
    'applied physical sciences',
    'computer sciences',
    'applied mathematics',
    'engineering',
    'chemistry',
    'statistics',
    'sustainability science',
    'physics',
    'astronomy',
]
biologicalSciences = [
    'biophysics and computational biology',
    'immunology and inflammation',
    'neuroscience',
    'genetics',
    'ecology',
    'medical sciences',
    'evolution',
    'plant biology',
    'agricultural sciences',
    'applied biological sciences',
    'biochemistry',
    'developmental biology',
    'systems biology',
    'microbiology',
    'pharmacology',
    'cell biology',
    'physiology',
    'population biology',
]
# NOTE(review): the empty string is listed here, so a blank subject label is
# classified as a social science -- confirm this is intended.
socialSciences = [
    'economic sciences',
    'psychological and cognitive sciences',
    'demography',
    'political sciences',
    'social sciences',
    'anthropology',
    '',
]

# Umbrella name -> list of subject labels belonging to it.
umbrellas = dict(
    physicalSciences=physicalSciences,
    biologicalSciences=biologicalSciences,
    socialSciences=socialSciences,
)

# NOTE: in the summary dictionaries below, every 'q2' should be read as q3
# and every 'mean' should be read as the median.

# Metric name -> the per-column container holding that metric.
MasterDictionary3 = dict(
    NetworkNodes=NetworkNodes,
    NetworkEdges=NetworkEdges,
    NetworkDegree=NetworkDegree,
    NetworkDensity=NetworkDensity,
    NetworkAvgClustering=NetworkAvgClustering,
    NetworkLargestComponent=NetworkLargestComponent,
    NetworkDegreeWeighted=NetworkDegreeWeighted,
)

# Per-umbrella summary statistics, one zero-initialised entry per metric.
bioQ1Q2Mean = {metric: {'q1': 0, 'q2': 0} for metric in MasterDictionary3}
physQ1Q2Mean = {metric: {'q1': 0, 'q2': 0} for metric in MasterDictionary3}
sociQ1Q2Mean = {metric: {'q1': 0, 'q2': 0} for metric in MasterDictionary3}

# For each metric, split the technical-summary values by subject umbrella and
# record the 25th percentile ('q1'), 75th percentile ('q2') and median ('mean').
for key, measure in MasterDictionary3.items():
    techValues = measure["Technical Summary"]
    bio = []
    phys = []
    soci = []

    # Iterate over the labels actually loaded rather than a fixed 1..300
    # range, and skip any index without a computed metric, so a shorter
    # input does not raise KeyError.
    for i, label in subjectLabels.items():
        if i not in techValues:
            continue
        if label in physicalSciences:
            phys.append(techValues[i])
        elif label in biologicalSciences:
            bio.append(techValues[i])
        elif label in socialSciences:
            soci.append(techValues[i])

    for groupValues, groupStats in (
        (bio, bioQ1Q2Mean),
        (phys, physQ1Q2Mean),
        (soci, sociQ1Q2Mean),
    ):
        if groupValues:
            groupArray = np.fromiter(groupValues, dtype=float)
            groupStats[key]['q1'] = np.percentile(groupArray, 25)
            groupStats[key]['q2'] = np.percentile(groupArray, 75)
            groupStats[key]['mean'] = np.median(groupArray)
        else:
            # No summaries fell under this umbrella: record NaN explicitly
            # instead of asking NumPy for statistics of an empty array.
            groupStats[key]['q1'] = float('nan')
            groupStats[key]['q2'] = float('nan')
            groupStats[key]['mean'] = float('nan')

# Independent-samples t-test for every pair of umbrellas and every graph
# metric, using the technical-summary values only.
# results[(umbrellaA, umbrellaB)][metric label] -> {'t': ..., 'p': ...}
results = {}

for pair in itertools.combinations(umbrellas.keys(), 2):
    subjOne, subjTwo = pair
    results[pair] = {}

    for name, measure in zip(graph_measure_names, graph_measures):
        techValues = measure["Technical Summary"]
        subOneValues = []
        subTwoValues = []

        for i in range(1, len(subjectLabels) + 1):
            subject = subjectLabels.get(i, "").lower()
            # Indices without a computed metric contribute to neither group.
            if i not in techValues:
                continue
            if subject in umbrellas[subjOne]:
                subOneValues.append(techValues[i])
            elif subject in umbrellas[subjTwo]:
                subTwoValues.append(techValues[i])

        t_stat, p_val = ttest_ind(subOneValues, subTwoValues)
        results[pair][name] = dict(t=t_stat, p=p_val)

# Report every comparison, bucketed by p-value:
# NaN -> no data, < 0.05 -> significant, < 0.1 -> weak, otherwise none.
for pair, metrics in results.items():
    for metric_name, stats in metrics.items():
        if np.isnan(stats['p']):
            print(f"No data for {metric_name} in comparison {pair}")
            continue

        if stats['p'] >= 0.1:
            print(f"No significant difference in {metric_name} for {pair}: p = {stats['p']:.4f}, t = {stats['t']:.3f}")
        elif stats['p'] >= 0.05:
            print(f"Weak difference in {metric_name} for {pair}: t = {stats['t']:.3f}, p = {stats['p']:.4f}")
        else:
            print(f"Significant difference in {metric_name} for {pair}: t = {stats['t']:.3f}, p = {stats['p']:.4f}")


#print(bioQ1Q2Mean)
#print(physQ1Q2Mean)
#print(sociQ1Q2Mean)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
No significant difference in NetworkNodes for ('physicalSciences', 'biologicalSciences'): p = 0.6227, t = 0.493
No significant difference in NetworkEdges for ('physicalSciences', 'biologicalSciences'): p = 0.2882, t = 1.065
Significant difference in Network Degree for ('physicalSciences', 'biologicalSciences'): t = 2.026, p = 0.0439
Weak difference in NetworkDensity for ('physicalSciences', 'biologicalSciences'): t = 1.871, p = 0.0627
No significant difference in NetworkAvgClustering for ('physicalSciences', 'biologicalSciences'): p = 0.5464, t = 0.604
Weak difference in NetworkLargestComponent for ('physicalSciences', 'biologicalSciences'): t = 1.774, p = 0.0774
Weak difference in NetworkDegreeWeighted for ('physicalSciences', 'biologicalSciences'): t = 1.874, p = 0.0622
No significant difference in NetworkNodes for ('physicalSciences', 'socialSciences'): p 