<a href="https://colab.research.google.com/github/Yhola/Discourse-Analysis-Tool/blob/main/Updated%20Discourse%20Analysis%20Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io

!pip install python-docx
!pip install spacy
!python -m spacy download en_core_web_sm
import docx
import re
import pandas as pd
import spacy

# Authenticate user and initialize Google Drive API
auth.authenticate_user()
service = build('drive', 'v3')

# Function to download and extract text from Google Docs files
def download_doc(file_id):
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
    fh.seek(0)
    doc = docx.Document(fh)
    text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
    return text

# File IDs for spoken and written discourse
spoken_file_id = 'ADD YOUR FILE ID'
written_file_id = 'ADD YOUR FILE ID'

# Download and extract text from files
spoken_text = download_doc(spoken_file_id)
written_text = download_doc(written_file_id)

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

# Function to analyze linguistic features
def analyze_features(text):
    doc = nlp(text)
    features = {
        'Grammatical Complexity': len(re.findall(r'\b(because|since|when|so|if|as|while|where)\b', text, re.IGNORECASE)),
        'Lexical Density': len(re.findall(r'\b(\w+)\b', text)),
        'Nominalization': len([token for token in doc if token.tag_ == 'VBG' and token.dep_ == 'nsubj']),
        'Explicitness': len([token for token in doc if token.text.lower() in ['this', 'that', 'these', 'those', 'it', 'they', 'he', 'she']]),
        'Contextualization': len([token for token in doc if token.dep_ in ['nsubj', 'dobj', 'iobj'] and token.head.dep_ in ['advmod', 'acl', 'relcl']]),
        'Spontaneity': len(re.findall(r'\b(\w+|\w+ing)\b', text)),
        'Repetition and Redundancy': len(re.findall(r'\b(a|an|the|uh|um|you know|i mean|so that\'s why)\b', text, re.IGNORECASE))
    }
    return features

# Analyze both texts
spoken_features = analyze_features(spoken_text)
written_features = analyze_features(written_text)

# Creating dataframe to present results
columns = ['Linguistic Features', 'Constituents', 'Spoken Discourse', 'Written Discourse']

# Adding constituents
data = [
    ['Grammatical Complexity', 'Sentence structure: because, since, when, so, if, as, while, where', spoken_features['Grammatical Complexity'], written_features['Grammatical Complexity']],
    ['Lexical Density', 'Container words: verbs, nouns, adjectives, adverbs', spoken_features['Lexical Density'], written_features['Lexical Density']],
    ['Nominalization', 'Use of nouns derived from verbs or adjectives to make a concept more abstract, e.g., "swimming" as an activity rather than the action of "to swim"', spoken_features['Nominalization'], written_features['Nominalization']],
    ['Explicitness', 'Use of pronouns and demonstrative words (this, that, these, those) to specify referents clearly, indicating explicit reference to entities or concepts', spoken_features['Explicitness'], written_features['Explicitness']],
    ['Contextualization', 'Words and phrases that indicate contextual dependencies, such as referential or situational references requiring shared knowledge', spoken_features['Contextualization'], written_features['Contextualization']],
    ['Spontaneity', 'Grammatical and ungrammatical structures', spoken_features['Spontaneity'], written_features['Spontaneity']],
    ['Repetition and Redundancy', 'aas and umms, and more fillers like you know, I mean, so that\'s why', spoken_features['Repetition and Redundancy'], written_features['Repetition and Redundancy']]
]

df = pd.DataFrame(data, columns=columns)

# Save dataframe to CSV file
df.to_csv('linguistic_features_analysis.csv', index=True)

# Display the dataframe
from IPython.display import display
print("Linguistic Features Analysis:")
display(df)

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python

Unnamed: 0,Linguistic Features,Constituents,Spoken Discourse,Written Discourse
0,Grammatical Complexity,"Sentence structure: because, since, when, so, ...",182,34
1,Lexical Density,"Container words: verbs, nouns, adjectives, adv...",6772,2834
2,Nominalization,Use of nouns derived from verbs or adjectives ...,2,0
3,Explicitness,"Use of pronouns and demonstrative words (this,...",435,70
4,Contextualization,Words and phrases that indicate contextual dep...,138,59
5,Spontaneity,Grammatical and ungrammatical structures,6772,2834
6,Repetition and Redundancy,"aas and umms, and more fillers like you know, ...",452,183
